diff --git a/ANALYSIS_VIEW_MOCKUP.md b/ANALYSIS_VIEW_MOCKUP.md index 679164318..3eb7c5f94 100644 --- a/ANALYSIS_VIEW_MOCKUP.md +++ b/ANALYSIS_VIEW_MOCKUP.md @@ -99,7 +99,7 @@ ISSUES: 172 ## 5. Issues Categorized -``` +```` ------------------------------------------ Unused Imports: 134 (7.1%) / 1,876 List Filenames and imports: @@ -138,7 +138,7 @@ Location and codeblocks: else: result[key] = value return result - ``` +```` 2. src/core/processor.py (lines 156-172) and src/api/handlers.py (lines 78-94) ```python @@ -151,7 +151,9 @@ Location and codeblocks: raise ValueError(f"Missing required field: {field}") return True ``` ------------- + +______________________________________________________________________ + Issues with Function Call in/out points: 72 (From Call site tracking and Function call relationships) @@ -159,65 +161,77 @@ Issues with Function Call in/out points: 72 - src/utils/validator.py/validate_input - Missing required parameters in 3 locations - src/api/routes.py/register_routes - Incorrect return value handling in 4 locations - src/models/user.py/User.from_dict - Passing non-dict values in 2 locations ------------- + +______________________________________________________________________ + Input/output parameter analysis Valid 1726 parameters Issues 11 parameters: + 1. /src/core/processor.py/Processor/process_data/options - Type mismatch (expected Dict, received List) -2. /src/api/routes.py/create_user/user_data - Missing validation for required fields -3. /src/utils/formatter.py/format_output/data - Null value passed without null check -4. /src/models/transaction.py/Transaction/validate/amount - Negative values not handled -5. /src/core/analyzer.py/analyze_code/filepath - Non-existent file paths not handled ------------- +1. /src/api/routes.py/create_user/user_data - Missing validation for required fields +1. /src/utils/formatter.py/format_output/data - Null value passed without null check +1. /src/models/transaction.py/Transaction/validate/amount - Negative values not handled +1. /src/core/analyzer.py/analyze_code/filepath - Non-existent file paths not handled + +______________________________________________________________________ + Interface implementation verification: Valid: 71 components Issues: 6 components: + 1. src/models/user.tsx - UserComponent doesn't implement all required UserProps -2. src/components/form.tsx - FormInput missing required onChange handler -3. src/views/dashboard.tsx - DashboardView implements deprecated IDashboard interface -4. src/api/client.ts - ApiClient missing required error handling methods -5. src/utils/formatter.tsx - DataFormatter missing required format method -6. src/components/table.tsx - TableComponent not implementing required sorting functionality +1. src/components/form.tsx - FormInput missing required onChange handler +1. src/views/dashboard.tsx - DashboardView implements deprecated IDashboard interface +1. src/api/client.ts - ApiClient missing required error handling methods +1. src/utils/formatter.tsx - DataFormatter missing required format method +1. src/components/table.tsx - TableComponent not implementing required sorting functionality + ``` ## 6. 
Visualization Types ``` + ALL VISUALIZATION TYPES Selection 1- type (Example - hierarchy, dependency) after selecting 1-> 2nd selection is corresponding parameter (example -call hierarchy, symbol hierarchy, Inheritance hierarchy) Again- corresponding parameter selection (if applicable)- (For example - codefile / class / method) Hierarchy Visualizations: - - Call hierarchy visualization - - By file: src/core/processor.py - - By class: Processor - - By method: process_data - - Symbol hierarchy visualization - - By module: src/core - - By file: src/core/processor.py - - By class: Processor - - Inheritance hierarchy visualization - - By class: BaseProcessor + +- Call hierarchy visualization + - By file: src/core/processor.py + - By class: Processor + - By method: process_data +- Symbol hierarchy visualization + - By module: src/core + - By file: src/core/processor.py + - By class: Processor +- Inheritance hierarchy visualization + - By class: BaseProcessor Dependency Visualizations: - - Module dependency visualization - - By module: src/core - - By file: src/core/processor.py - - Symbol dependency visualization - - By class: Processor - - By function: process_data + +- Module dependency visualization + - By module: src/core + - By file: src/core/processor.py +- Symbol dependency visualization + - By class: Processor + - By function: process_data Flow Visualizations: - - Function call visualization - - By function: process_data - - By class method: Processor.validate_input - - Package structure visualization - - Full project - - By module: src/core - - Variable usage tracking - - By variable: config - - By class attribute: Processor.options + +- Function call visualization + - By function: process_data + - By class method: Processor.validate_input +- Package structure visualization + - Full project + - By module: src/core +- Variable usage tracking + - By variable: config + - By class attribute: Processor.options + ``` ## Suggested Views @@ -279,4 +293,4 @@ For the Full Specific Issues View, I recommend a detailed, filterable list of al - Specific, actionable steps to resolve each issue - Code examples of proper implementation - Links to relevant documentation or best practices - +``` diff --git a/codegen-examples/examples/snapshot_event_handler/README.md b/codegen-examples/examples/snapshot_event_handler/README.md index 8899580e1..447d154cd 100644 --- a/codegen-examples/examples/snapshot_event_handler/README.md +++ b/codegen-examples/examples/snapshot_event_handler/README.md @@ -1,6 +1,6 @@ # Event Handler with codebase snapshotting -This project is designed to using Modal snapshotting to provide parsed codebase instances with minimal latency, make it more manageable to write event based handlers. +This project uses Modal snapshotting to provide parsed codebase instances with minimal latency, making it easier to write event-based handlers. Follow the instructions below to set up and deploy the application. @@ -9,7 +9,7 @@ Follow the instructions below to set up and deploy the application. Before you begin, ensure you have the following installed and configured: 1. **uv**: A tool for managing virtual environments and syncing dependencies. -2. **Modal**: Ensure you have Modal configured on your system. +1. **Modal**: Ensure you have Modal configured on your system. ## Setup Instructions @@ -23,7 +23,7 @@ Before you begin, ensure you have the following installed and configured: source ./venv/bin/activate ``` -2. **Sync Dependencies** +1. 
**Sync Dependencies** Sync the project dependencies using `uv`: @@ -31,7 +31,7 @@ Before you begin, ensure you have the following installed and configured: uv sync ``` -3. **Deploy to Modal** +1. **Deploy to Modal** Deploy the application to Modal by running: @@ -48,7 +48,6 @@ Before you begin, ensure you have the following installed and configured: - `.env.template` and `.env`: Environment variable templates and configurations. - `pyproject.toml`: Project configuration and dependencies. - ## Integration -Once deployed, you can use the deployed web_url as the webhook endpoint for your slack, linear, or github webhooks. \ No newline at end of file +Once deployed, you can use the deployed web_url as the webhook endpoint for your slack, linear, or github webhooks. diff --git a/codegen-examples/examples/swebench_agent_run/local_run.ipynb b/codegen-examples/examples/swebench_agent_run/local_run.ipynb index f2f73c922..237732bbf 100644 --- a/codegen-examples/examples/swebench_agent_run/local_run.ipynb +++ b/codegen-examples/examples/swebench_agent_run/local_run.ipynb @@ -32,7 +32,14 @@ "metadata": {}, "outputs": [], "source": [ - "await run_eval(use_existing_preds=None, dataset=\"lite\", length=5, repo=\"django/django\", num_workers=10, model=\"claude-3-7-sonnet-latest\")" + "await run_eval(\n", + " use_existing_preds=None,\n", + " dataset=\"lite\",\n", + " length=5,\n", + " repo=\"django/django\",\n", + " num_workers=10,\n", + " model=\"claude-3-7-sonnet-latest\",\n", + ")" ] }, { @@ -76,7 +83,12 @@ "source": [ "from codegen.agents.code_agent import CodeAgent\n", "\n", - "agent = CodeAgent(codebase=codebase, tags=[\"local_test\"], model_name=\"claude-3-5-sonnet-latest\", model_provider=\"anthropic\")" + "agent = CodeAgent(\n", + " codebase=codebase,\n", + " tags=[\"local_test\"],\n", + " model_name=\"claude-3-5-sonnet-latest\",\n", + " model_provider=\"anthropic\",\n", + ")" ] }, { diff --git a/codegen-on-oss/README.md b/codegen-on-oss/README.md index a7700eb77..50062a18d 100644 --- a/codegen-on-oss/README.md +++ b/codegen-on-oss/README.md @@ -1,18 +1,144 @@ -# Overview +# Codegen on OSS -The **Codegen on OSS** package provides a modular pipeline that: +The **Codegen on OSS** package provides a comprehensive toolkit for codebase analysis, context management, and agent execution. It integrates the functionality from the original harness.py with enhanced capabilities for codebase analysis and context saving. -- **Collects repository URLs** from different sources (e.g., CSV files or GitHub searches). -- **Parses repositories** using the codegen tool. -- **Profiles performance** and logs metrics for each parsing run. -- **Logs errors** to help pinpoint parsing failures or performance bottlenecks. +## New Features -______________________________________________________________________ +### 1. Comprehensive Codebase Analysis + +The package now includes powerful tools for analyzing codebases: + +- **CodebaseAnalysisHarness**: Integrates core functionality from harness.py to provide detailed codebase analysis +- **Diff Generation**: Track file changes and generate diffs between commits +- **File Statistics**: Get comprehensive metrics about files, classes, functions, and their relationships + +### 2. 
Context Saving and Retrieval + +Save and restore codebase state for later use: + +- **CodebaseContextSnapshot**: Save and restore codebase state and analysis results +- **S3 Integration**: Store snapshots in S3-compatible storage via BucketStore +- **Local Storage**: Save snapshots locally for easy access + +### 3. CodeContextRetrievalServer + +A FastAPI server that provides a REST API for accessing all functionality: + +- **Repository Analysis**: Analyze repositories and get detailed metrics +- **Snapshot Management**: Create, list, and load snapshots +- **Agent Execution**: Run AI agents with saved context for code analysis and modification + +## Getting Started + +### Installation + +```bash +pip install codegen-on-oss +``` + +### Basic Usage + +#### Analyzing a Repository + +```python +from codegen_on_oss.analysis import CodebaseAnalysisHarness +from codegen_on_oss.snapshot import CodebaseContextSnapshot + +# Create a harness and analyze a codebase +harness = CodebaseAnalysisHarness.from_repo("owner/repo") +results = harness.analyze_codebase() + +# Save the state for later +snapshot = CodebaseContextSnapshot(harness) +snapshot_id = snapshot.create_snapshot() +``` + +#### Starting the Server + +```bash +cgparse serve --host 0.0.0.0 --port 8000 +``` ## Package Structure The package is composed of several modules: +- **analysis**: Codebase analysis tools + + - `harness_integration.py`: Integration of harness.py functionality + - Other analysis modules for parsing and metrics + +- **snapshot**: Context saving and retrieval + + - `context_snapshot.py`: Save and restore codebase state + - Other snapshot-related modules + +- **context_server**: FastAPI server for accessing functionality + + - `server.py`: REST API for codebase analysis and context management + +- **sources**: Repository source definitions + + - Defines the Repository source classes and settings + +- **cli**: Command-line interface + + - Built with Click, provides commands for parsing, analysis, and serving + +## CLI Commands + +The CLI provides several commands: + +- `run-one`: Parse a single repository specified by URL +- `run`: Iterate over repositories from a source and parse each one +- `serve`: Start the CodeContextRetrievalServer + +### Example: Starting the Server + +```bash +cgparse serve --host 0.0.0.0 --port 8000 +``` + +### Example: Analyzing a Repository + +```bash +cgparse run-one https://github.com/owner/repo +``` + +## API Endpoints + +When running the server, the following endpoints are available: + +- `/analyze/repository`: Analyze a repository and return results +- `/analyze/file_stats`: Get file statistics for an analyzed repository +- `/snapshot/create`: Create a snapshot of the current state +- `/snapshot/list`: List all available snapshots +- `/snapshot/load/{snapshot_id}`: Load a snapshot by ID +- `/agent/run`: Run an agent on the codebase + +## Example Scripts + +Check the `scripts` directory for example usage: + +- `example_usage.py`: Demonstrates how to use the CodebaseAnalysisHarness and CodebaseContextSnapshot + +## Original Functionality + +The package still includes all the original functionality: + +- **Repository Sources**: Collect repository URLs from different sources +- **Repository Parsing**: Parse repositories using the codegen tool +- **Performance Profiling**: Log metrics for each parsing run +- **Error Logging**: Log errors to help pinpoint parsing failures + +For more details on the original functionality, see the "Package Structure" and "Getting Started" sections below. 
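The endpoint paths above are the documented surface; the HTTP methods and JSON field names in the sketch below (`repo_url`, `snapshot_id`) are assumptions rather than a published contract, so confirm the exact schemas against the server's OpenAPI docs (`/docs`) on a running deployment.

```python
# Hypothetical client for a CodeContextRetrievalServer started with
# `cgparse serve --host 0.0.0.0 --port 8000`. The paths come from the
# "API Endpoints" list above; the methods and payload/response field
# names are assumptions, not documented behavior.
import requests

BASE_URL = "http://localhost:8000"

# Analyze a repository (field name assumed; mirrors the repo_url used elsewhere in this package)
analysis = requests.post(
    f"{BASE_URL}/analyze/repository",
    json={"repo_url": "https://github.com/owner/repo"},
).json()

# Snapshot the analyzed state, then list and reload snapshots by ID
created = requests.post(f"{BASE_URL}/snapshot/create").json()
available = requests.get(f"{BASE_URL}/snapshot/list").json()
snapshot_id = created.get("snapshot_id")  # assumed response key
restored = requests.post(f"{BASE_URL}/snapshot/load/{snapshot_id}").json()
```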
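For purely local workflows, the same building blocks can be combined in Python. This is a sketch, not a prescribed workflow: it assumes `CodebaseAnalysisHarness` accepts an already-parsed `Codebase` (as its constructor signature suggests) and reuses the `CodeAgent` construction shown in the swebench notebook elsewhere in this repository.

```python
# Sketch: analyze a repo, snapshot the state, and hand the same parsed
# Codebase to an agent. Wiring is based on the constructor and usage
# examples in this PR; adjust to the actual APIs if they differ.
from codegen import Codebase
from codegen.agents.code_agent import CodeAgent

from codegen_on_oss.analysis import CodebaseAnalysisHarness
from codegen_on_oss.snapshot import CodebaseContextSnapshot

codebase = Codebase.from_repo("owner/repo")
harness = CodebaseAnalysisHarness(codebase)
results = harness.analyze_codebase()

# Persist the analysis so it can be reloaded later (locally or via S3/BucketStore)
snapshot_id = CodebaseContextSnapshot(harness).create_snapshot()

# Drive an agent against the same parsed codebase, as in the notebook example
agent = CodeAgent(
    codebase=codebase,
    tags=["analysis"],
    model_name="claude-3-5-sonnet-latest",
    model_provider="anthropic",
)
```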
+ +______________________________________________________________________ + +## Original Package Structure + +The package is composed of several modules: + - `sources` - Defines the Repository source classes and settings. Settings are all configurable via environment variables @@ -186,152 +312,3 @@ There is a Dockerfile that can be used to create an image capable of running the **Input & Configuration** Explore a better CLI for providing options to the Modal run. - -______________________________________________________________________ - -## Example Log Output - -```shell -[codegen-on-oss*] codegen/codegen-on-oss/$ uv run cgparse run --source csv - 21:32:36 INFO Cloning repository https://github.com/JohnSnowLabs/spark-nlp.git - 21:36:57 INFO { - "profile_name": "https://github.com/JohnSnowLabs/spark-nlp.git", - "step": "codebase_init", - "delta_time": 7.186550649999845, - "cumulative_time": 7.186550649999845, - "cpu_time": 180.3553702, - "memory_usage": 567525376, - "memory_delta": 317095936, - "error": null -} - 21:36:58 INFO { - "profile_name": "https://github.com/JohnSnowLabs/spark-nlp.git", - "step": "post_init_validation", - "delta_time": 0.5465090990001045, - "cumulative_time": 7.733059748999949, - "cpu_time": 180.9174761, - "memory_usage": 569249792, - "memory_delta": 1724416, - "error": null -} - 21:36:58 ERROR Repository: https://github.com/JohnSnowLabs/spark-nlp.git -Traceback (most recent call last): - - File "/home/codegen/codegen/codegen-on-oss/.venv/bin/cgparse", line 10, in - sys.exit(cli()) - │ │ └ - │ └ - └ - File "/home/codegen/codegen/codegen-on-oss/.venv/lib/python3.12/site-packages/click/core.py", line 1161, in __call__ - return self.main(*args, **kwargs) - │ │ │ └ {} - │ │ └ () - │ └ - └ - File "/home/codegen/codegen/codegen-on-oss/.venv/lib/python3.12/site-packages/click/core.py", line 1082, in main - rv = self.invoke(ctx) - │ │ └ - │ └ - └ - File "/home/codegen/codegen/codegen-on-oss/.venv/lib/python3.12/site-packages/click/core.py", line 1697, in invoke - return _process_result(sub_ctx.command.invoke(sub_ctx)) - │ │ │ │ └ - │ │ │ └ - │ │ └ - │ └ - └ ._process_result at 0x7f466597fb00> - File "/home/codegen/codegen/codegen-on-oss/.venv/lib/python3.12/site-packages/click/core.py", line 1443, in invoke - return ctx.invoke(self.callback, **ctx.params) - │ │ │ │ │ └ {'source': 'csv', 'output_path': 'metrics.csv', 'error_output_path': 'errors.log', 'cache_dir': PosixPath('/home/.cache... - │ │ │ │ └ - │ │ │ └ - │ │ └ - │ └ - └ - File "/home/codegen/codegen/codegen-on-oss/.venv/lib/python3.12/site-packages/click/core.py", line 788, in invoke - return __callback(*args, **kwargs) - │ └ {'source': 'csv', 'output_path': 'metrics.csv', 'error_output_path': 'errors.log', 'cache_dir': PosixPath('/home/.cache... 
- └ () - - File "/home/codegen/codegen/codegen-on-oss/codegen_on_oss/cli.py", line 121, in run - parser.parse(repo_url) - │ │ └ 'https://github.com/JohnSnowLabs/spark-nlp.git' - │ └ - └ - - File "/home/codegen/codegen/codegen-on-oss/codegen_on_oss/parser.py", line 52, in parse - with self.metrics_profiler.start_profiler( - │ │ └ - │ └ - └ - - File "/home/.local/share/uv/python/cpython-3.12.6-linux-x86_64-gnu/lib/python3.12/contextlib.py", line 158, in __exit__ - self.gen.throw(value) - │ │ │ └ ParseRunError() - │ │ └ - │ └ - └ - -> File "/home/codegen/codegen/codegen-on-oss/codegen_on_oss/metrics.py", line 41, in start_profiler - yield profile - └ - - File "/home/codegen/codegen/codegen-on-oss/codegen_on_oss/parser.py", line 64, in parse - raise ParseRunError(validation_status) - │ └ - └ - -codegen_on_oss.parser.ParseRunError: LOW_IMPORT_RESOLUTION_RATE - 21:36:58 INFO { - "profile_name": "https://github.com/JohnSnowLabs/spark-nlp.git", - "step": "TOTAL", - "delta_time": 7.740976418000173, - "cumulative_time": 7.740976418000173, - "cpu_time": 180.9221699, - "memory_usage": 569249792, - "memory_delta": 0, - "error": "LOW_IMPORT_RESOLUTION_RATE" -} - 21:36:58 INFO Cloning repository https://github.com/Lightning-AI/lightning.git - 21:37:53 INFO { - "profile_name": "https://github.com/Lightning-AI/lightning.git", - "step": "codebase_init", - "delta_time": 24.256577352999557, - "cumulative_time": 24.256577352999557, - "cpu_time": 211.3604081, - "memory_usage": 1535971328, - "memory_delta": 966184960, - "error": null -} - 21:37:53 INFO { - "profile_name": "https://github.com/Lightning-AI/lightning.git", - "step": "post_init_validation", - "delta_time": 0.137609629000508, - "cumulative_time": 24.394186982000065, - "cpu_time": 211.5082702, - "memory_usage": 1536241664, - "memory_delta": 270336, - "error": null -} - 21:37:53 INFO { - "profile_name": "https://github.com/Lightning-AI/lightning.git", - "step": "TOTAL", - "delta_time": 24.394700584999555, - "cumulative_time": 24.394700584999555, - "cpu_time": 211.5088282, - "memory_usage": 1536241664, - "memory_delta": 0, - "error": null -} -``` - -## Example Metrics Output - -| profile_name | step | delta_time | cumulative_time | cpu_time | memory_usage | memory_delta | error | -| ---------------------- | -------------------- | ------------------ | ------------------ | ----------- | ------------ | ------------ | -------------------------- | -| JohnSnowLabs/spark-nlp | codebase_init | 7.186550649999845 | 7.186550649999845 | 180.3553702 | 567525376 | 317095936 | | -| JohnSnowLabs/spark-nlp | post_init_validation | 0.5465090990001045 | 7.733059748999949 | 180.9174761 | 569249792 | 1724416 | | -| JohnSnowLabs/spark-nlp | TOTAL | 7.740976418000173 | 7.740976418000173 | 180.9221699 | 569249792 | 0 | LOW_IMPORT_RESOLUTION_RATE | -| Lightning-AI/lightning | codebase_init | 24.256577352999557 | 24.256577352999557 | 211.3604081 | 1535971328 | 966184960 | | -| Lightning-AI/lightning | post_init_validation | 0.137609629000508 | 24.394186982000065 | 211.5082702 | 1536241664 | 270336 | | -| Lightning-AI/lightning | TOTAL | 24.394700584999555 | 24.394700584999555 | 211.5088282 | 1536241664 | 0 | | diff --git a/codegen-on-oss/codegen_on_oss/analysis/__init__.py b/codegen-on-oss/codegen_on_oss/analysis/__init__.py new file mode 100644 index 000000000..60d83758b --- /dev/null +++ b/codegen-on-oss/codegen_on_oss/analysis/__init__.py @@ -0,0 +1,7 @@ +""" +Analysis module for codegen-on-oss. 
+""" + +from codegen_on_oss.analysis.harness_integration import CodebaseAnalysisHarness + +__all__ = ["CodebaseAnalysisHarness"] diff --git a/codegen-on-oss/codegen_on_oss/analysis/analysis.py b/codegen-on-oss/codegen_on_oss/analysis/analysis.py index 9e956ec06..68c4aac0e 100644 --- a/codegen-on-oss/codegen_on_oss/analysis/analysis.py +++ b/codegen-on-oss/codegen_on_oss/analysis/analysis.py @@ -1,23 +1,26 @@ -from fastapi import FastAPI -from pydantic import BaseModel -from typing import Dict, List, Tuple, Any +import contextlib +import math +import os +import re +import shutil +import subprocess +import tempfile +from datetime import datetime, timedelta +from typing import Any + +import modal +import requests from codegen import Codebase +from codegen.sdk.core.expressions.binary_expression import BinaryExpression +from codegen.sdk.core.expressions.comparison_expression import ComparisonExpression +from codegen.sdk.core.expressions.unary_expression import UnaryExpression from codegen.sdk.core.statements.for_loop_statement import ForLoopStatement from codegen.sdk.core.statements.if_block_statement import IfBlockStatement from codegen.sdk.core.statements.try_catch_statement import TryCatchStatement from codegen.sdk.core.statements.while_statement import WhileStatement -from codegen.sdk.core.expressions.binary_expression import BinaryExpression -from codegen.sdk.core.expressions.unary_expression import UnaryExpression -from codegen.sdk.core.expressions.comparison_expression import ComparisonExpression -import math -import re -import requests -from datetime import datetime, timedelta -import subprocess -import os -import tempfile +from fastapi import FastAPI from fastapi.middleware.cors import CORSMiddleware -import modal +from pydantic import BaseModel image = ( modal.Image.debian_slim() @@ -40,7 +43,12 @@ ) -def get_monthly_commits(repo_path: str) -> Dict[str, int]: +class GitExecutableNotFoundError(ValueError): + """Error raised when the git executable is not found in PATH.""" + pass + + +def get_monthly_commits(repo_path: str) -> dict[str, int]: """ Get the number of commits per month for the last 12 months. 
@@ -62,18 +70,34 @@ def get_monthly_commits(repo_path: str) -> Dict[str, int]: original_dir = os.getcwd() with tempfile.TemporaryDirectory() as temp_dir: - subprocess.run(["git", "clone", repo_path, temp_dir], check=True) + # Use full path to git executable + git_executable = shutil.which("git") + if not git_executable: + raise GitExecutableNotFoundError() + + # Use subprocess with full path to git + subprocess.run( + [git_executable, "clone", repo_path, temp_dir], + check=True, + capture_output=True, + text=True + ) os.chdir(temp_dir) cmd = [ - "git", + git_executable, "log", f"--since={since_date}", f"--until={until_date}", "--format=%aI", ] - result = subprocess.run(cmd, capture_output=True, text=True, check=True) + result = subprocess.run( + cmd, + capture_output=True, + text=True, + check=True + ) commit_dates = result.stdout.strip().split("\n") monthly_counts = {} @@ -102,10 +126,8 @@ def get_monthly_commits(repo_path: str) -> Dict[str, int]: print(f"Error processing git commits: {e}") return {} finally: - try: + with contextlib.suppress(Exception): os.chdir(original_dir) - except: - pass def calculate_cyclomatic_complexity(function): @@ -117,7 +139,7 @@ def analyze_statement(statement): if hasattr(statement, "elif_statements"): complexity += len(statement.elif_statements) - elif isinstance(statement, (ForLoopStatement, WhileStatement)): + elif isinstance(statement, ForLoopStatement | WhileStatement): complexity += 1 elif isinstance(statement, TryCatchStatement): @@ -255,10 +277,7 @@ def count_lines(source: str): comments += 1 if line.strip().startswith('"""') or line.strip().startswith("'''"): code_part = "" - elif in_multiline: - comments += 1 - code_part = "" - elif line.strip().startswith("#"): + elif in_multiline or line.strip().startswith("#"): comments += 1 code_part = "" @@ -334,7 +353,7 @@ class RepoRequest(BaseModel): @fastapi_app.post("/analyze_repo") -async def analyze_repo(request: RepoRequest) -> Dict[str, Any]: +async def analyze_repo(request: RepoRequest) -> dict[str, Any]: """Analyze a repository and return comprehensive metrics.""" repo_url = request.repo_url codebase = Codebase.from_repo(repo_url) diff --git a/codegen-on-oss/codegen_on_oss/analysis/analysis_import.py b/codegen-on-oss/codegen_on_oss/analysis/analysis_import.py index 8166e5d31..f7a4e5ea5 100644 --- a/codegen-on-oss/codegen_on_oss/analysis/analysis_import.py +++ b/codegen-on-oss/codegen_on_oss/analysis/analysis_import.py @@ -1,11 +1,11 @@ import logging import modal -from codegen import CodegenApp, Codebase +import networkx as nx +from codegen import Codebase, CodegenApp from codegen.extensions.github.types.events.pull_request import PullRequestLabeledEvent from codegen.extensions.tools.github.create_pr_comment import create_pr_comment from dotenv import load_dotenv -import networkx as nx load_dotenv() @@ -47,7 +47,9 @@ def find_import_cycles(G): for i, cycle in enumerate(cycles, 1): print(f"\nCycle #{i}: Size {len(cycle)} files") - print(f"Total number of imports in cycle: {G.subgraph(cycle).number_of_edges()}") + print( + f"Total number of imports in cycle: {G.subgraph(cycle).number_of_edges()}" + ) print("\nFiles in this cycle:") for file in cycle: @@ -69,8 +71,12 @@ def find_problematic_import_loops(G, cycles): for to_file in scc: if G.has_edge(from_file, to_file): edges = G.get_edge_data(from_file, to_file) - dynamic_count = sum(1 for e in edges.values() if e["color"] == "red") - static_count = sum(1 for e in edges.values() if e["color"] == "black") + dynamic_count = sum( + 1 for e in 
edges.values() if e["color"] == "red" + ) + static_count = sum( + 1 for e in edges.values() if e["color"] == "black" + ) if dynamic_count > 0 and static_count > 0: mixed_imports[(from_file, to_file)] = { @@ -80,12 +86,20 @@ def find_problematic_import_loops(G, cycles): } if mixed_imports: - problematic_cycles.append({"files": scc, "mixed_imports": mixed_imports, "index": i}) - - print(f"Found {len(problematic_cycles)} cycles with potentially problematic imports.") + problematic_cycles.append({ + "files": scc, + "mixed_imports": mixed_imports, + "index": i, + }) + + print( + f"Found {len(problematic_cycles)} cycles with potentially problematic imports." + ) for i, cycle in enumerate(problematic_cycles): - print(f"\n⚠️ Problematic Cycle #{i + 1} (Index {cycle['index']}): Size {len(cycle['files'])} files") + print( + f"\n⚠️ Problematic Cycle #{i + 1} (Index {cycle['index']}): Size {len(cycle['files'])} files" + ) print("\nFiles in cycle:") for file in cycle["files"]: print(f" - {file}") @@ -101,7 +115,9 @@ def find_problematic_import_loops(G, cycles): @cg.github.event("pull_request:labeled") def handle_pr(event: PullRequestLabeledEvent): - codebase = Codebase.from_repo(event.repository.get("full_name"), commit=event.pull_request.head.sha) + codebase = Codebase.from_repo( + event.repository.get("full_name"), commit=event.pull_request.head.sha + ) G = create_graph_from_codebase(event.repository.get("full_name")) cycles = find_import_cycles(G) @@ -112,7 +128,9 @@ def handle_pr(event: PullRequestLabeledEvent): if problematic_loops: message.append("\n### ⚠️ Potentially Problematic Import Cycles") - message.append("Cycles with mixed static and dynamic imports, which might recquire attention.") + message.append( + "Cycles with mixed static and dynamic imports, which might require attention." 
+ ) for i, cycle in enumerate(problematic_loops, 1): message.append(f"\n#### Problematic Cycle {i}") for (from_file, to_file), imports in cycle["mixed_imports"].items(): diff --git a/codegen-on-oss/codegen_on_oss/analysis/codebase_context.py b/codegen-on-oss/codegen_on_oss/analysis/codebase_context.py index 5c0fd47dd..4e6056aff 100644 --- a/codegen-on-oss/codegen_on_oss/analysis/codebase_context.py +++ b/codegen-on-oss/codegen_on_oss/analysis/codebase_context.py @@ -9,12 +9,13 @@ from pathlib import Path from typing import TYPE_CHECKING, Any -from rustworkx import PyDiGraph, WeightedEdgeList - from codegen.configs.models.codebase import CodebaseConfig, PinkMode from codegen.configs.models.secrets import SecretsConfig from codegen.sdk.codebase.config import ProjectConfig, SessionOptions -from codegen.sdk.codebase.config_parser import ConfigParser, get_config_parser_for_language +from codegen.sdk.codebase.config_parser import ( + ConfigParser, + get_config_parser_for_language, +) from codegen.sdk.codebase.diff_lite import ChangeType, DiffLite from codegen.sdk.codebase.flagging.flags import Flags from codegen.sdk.codebase.io.file_io import FileIO @@ -23,8 +24,14 @@ from codegen.sdk.codebase.validation import get_edges, post_reset_validation from codegen.sdk.core.autocommit import AutoCommit, commiter from codegen.sdk.core.directory import Directory -from codegen.sdk.core.external.dependency_manager import DependencyManager, get_dependency_manager -from codegen.sdk.core.external.language_engine import LanguageEngine, get_language_engine +from codegen.sdk.core.external.dependency_manager import ( + DependencyManager, + get_dependency_manager, +) +from codegen.sdk.core.external.language_engine import ( + LanguageEngine, + get_language_engine, +) from codegen.sdk.enums import Edge, EdgeType, NodeType from codegen.sdk.extensions.sort import sort_editables from codegen.sdk.extensions.utils import uncache_all @@ -33,13 +40,11 @@ from codegen.shared.exceptions.control_flow import StopCodemodException from codegen.shared.logging.get_logger import get_logger from codegen.shared.performance.stopwatch_utils import stopwatch, stopwatch_with_sentry +from rustworkx import PyDiGraph, WeightedEdgeList if TYPE_CHECKING: from collections.abc import Generator, Mapping, Sequence - from codeowners import CodeOwners as CodeOwnersParser - from git import Commit as GitCommit - from codegen.git.repo_operator.repo_operator import RepoOperator from codegen.sdk.codebase.io.io import IO from codegen.sdk.codebase.node_classes.node_classes import NodeClasses @@ -51,6 +56,8 @@ from codegen.sdk.core.interfaces.importable import Importable from codegen.sdk.core.node_id_factory import NodeId from codegen.sdk.core.parser import Parser + from codeowners import CodeOwners as CodeOwnersParser + from git import Commit as GitCommit logger = get_logger(__name__) @@ -89,7 +96,9 @@ def get_node_classes(programming_language: ProgrammingLanguage) -> NodeClasses: return TSNodeClasses else: - from codegen.sdk.codebase.node_classes.generic_node_classes import GenericNodeClasses + from codegen.sdk.codebase.node_classes.generic_node_classes import ( + GenericNodeClasses, + ) return GenericNodeClasses @@ -108,8 +117,12 @@ class CodebaseContext: # =====[ computed attributes ]===== transaction_manager: TransactionManager - pending_syncs: list[DiffLite] # Diffs that have been applied to disk, but not the graph (to be used for sync graph) - all_syncs: list[DiffLite] # All diffs that have been applied to the graph (to be used for graph reset) + 
pending_syncs: list[ + DiffLite + ] # Diffs that have been applied to disk, but not the graph (to be used for sync graph) + all_syncs: list[ + DiffLite + ] # All diffs that have been applied to the graph (to be used for graph reset) _autocommit: AutoCommit generation: int parser: Parser[Expression] @@ -165,7 +178,11 @@ def __init__( self.secrets = secrets or SecretsConfig() self.repo_name = context.repo_operator.repo_name self.repo_path = str(Path(context.repo_operator.repo_path).resolve()) - self.full_path = os.path.join(self.repo_path, context.base_path) if context.base_path else self.repo_path + self.full_path = ( + os.path.join(self.repo_path, context.base_path) + if context.base_path + else self.repo_path + ) self.codeowners_parser = context.repo_operator.codeowners_parser self.base_url = context.repo_operator.base_url if not self.config.allow_external: @@ -178,19 +195,30 @@ def __init__( self._autocommit = AutoCommit(self) self.init_nodes = None self.init_edges = None - self.directories = dict() - self.parser = Parser.from_node_classes(self.node_classes, log_parse_warnings=self.config.debug) + self.directories = {} + self.parser = Parser.from_node_classes( + self.node_classes, log_parse_warnings=self.config.debug + ) self.extensions = self.node_classes.file_cls.get_extensions() # ORDER IS IMPORTANT HERE! - self.config_parser = get_config_parser_for_language(context.programming_language, self) - self.dependency_manager = get_dependency_manager(context.programming_language, self) + self.config_parser = get_config_parser_for_language( + context.programming_language, self + ) + self.dependency_manager = get_dependency_manager( + context.programming_language, self + ) self.language_engine = get_language_engine(context.programming_language, self) self.programming_language = context.programming_language # Raise warning if language is not supported - if self.programming_language is ProgrammingLanguage.UNSUPPORTED or self.programming_language is ProgrammingLanguage.OTHER: + if ( + self.programming_language is ProgrammingLanguage.UNSUPPORTED + or self.programming_language is ProgrammingLanguage.OTHER + ): logger.warning("WARNING: The codebase is using an unsupported language!") - logger.warning("Some features may not work as expected. Advanced static analysis will be disabled but simple file IO will still work.") + logger.warning( + "Some features may not work as expected. Advanced static analysis will be disabled but simple file IO will still work." 
+ ) # Assert config assertions # External import resolution must be enabled if syspath is enabled @@ -200,7 +228,10 @@ def __init__( raise ValueError(msg) # Build the graph - if not self.config.exp_lazy_graph and self.config.use_pink != PinkMode.ALL_FILES: + if ( + not self.config.exp_lazy_graph + and self.config.use_pink != PinkMode.ALL_FILES + ): self.build_graph(context.repo_operator) try: self.synced_commit = context.repo_operator.head_commit @@ -238,9 +269,15 @@ def build_graph(self, repo_operator: RepoOperator) -> None: if self.config.disable_file_parse: logger.warning("WARNING: File parsing is disabled!") else: - for filepath, _ in repo_operator.iter_files(subdirs=self.projects[0].subdirectories, extensions=self.extensions, ignore_list=GLOBAL_FILE_IGNORE_LIST): + for filepath, _ in repo_operator.iter_files( + subdirs=self.projects[0].subdirectories, + extensions=self.extensions, + ignore_list=GLOBAL_FILE_IGNORE_LIST, + ): syncs[SyncType.ADD].append(self.to_absolute(filepath)) - logger.info(f"> Parsing {len(syncs[SyncType.ADD])} files in {self.projects[0].subdirectories or 'ALL'} subdirectories with {self.extensions} extensions") + logger.info( + f"> Parsing {len(syncs[SyncType.ADD])} files in {self.projects[0].subdirectories or 'ALL'} subdirectories with {self.extensions} extensions" + ) self._process_diff_files(syncs, incremental=False) files: list[SourceFile] = self.get_nodes(NodeType.FILE) logger.info(f"> Found {len(files)} files") @@ -253,7 +290,9 @@ def build_graph(self, repo_operator: RepoOperator) -> None: def apply_diffs(self, diff_list: list[DiffLite]) -> None: """Applies the given set of diffs to the graph in order to match the current file system content""" if self.session_options: - self.session_options = self.session_options.model_copy(update={"max_seconds": None}) + self.session_options = self.session_options.model_copy( + update={"max_seconds": None} + ) logger.info(f"Applying {len(diff_list)} diffs to graph") files_to_sync: dict[Path, SyncType] = {} # Gather list of deleted files, new files to add, and modified files to reparse @@ -263,7 +302,10 @@ def apply_diffs(self, diff_list: list[DiffLite]) -> None: filepath = Path(diff.path) if extensions is not None and filepath.suffix not in extensions: continue - if self.projects[0].subdirectories is not None and not any(filepath.relative_to(subdir) for subdir in self.projects[0].subdirectories): + if self.projects[0].subdirectories is not None and not any( + filepath.relative_to(subdir) + for subdir in self.projects[0].subdirectories + ): continue if diff.change_type == ChangeType.Added: @@ -320,7 +362,9 @@ def _reset_files(self, syncs: list[DiffLite]) -> None: elif sync.change_type == ChangeType.Added: files_to_remove.append(sync.path) modified_files.add(sync.path) - logger.info(f"Writing {len(files_to_write)} files to disk and removing {len(files_to_remove)} files") + logger.info( + f"Writing {len(files_to_write)} files to disk and removing {len(files_to_remove)} files" + ) for file in files_to_remove: self.io.delete_file(file) to_save = set() @@ -341,7 +385,9 @@ def undo_applied_diffs(self) -> None: self.io.check_changes() self.pending_syncs.clear() # Discard pending changes if len(self.all_syncs) > 0: - logger.info(f"Unapplying {len(self.all_syncs)} diffs to graph. Current graph commit: {self.synced_commit}") + logger.info( + f"Unapplying {len(self.all_syncs)} diffs to graph. 
Current graph commit: {self.synced_commit}" + ) self._revert_diffs(list(reversed(self.all_syncs))) self.all_syncs.clear() @@ -349,13 +395,20 @@ def undo_applied_diffs(self) -> None: @commiter(reset=True) def _revert_diffs(self, diff_list: list[DiffLite]) -> None: """Resets the graph to its initial solve branch file state""" - reversed_diff_list = list(DiffLite.from_reverse_diff(diff) for diff in diff_list) + reversed_diff_list = [DiffLite.from_reverse_diff(diff) for diff in diff_list] self._autocommit.reset() self.apply_diffs(reversed_diff_list) # ====== [ Re-resolve lost edges from previous syncs ] ====== self.prune_graph() if self.config.verify_graph: - post_reset_validation(self.old_graph.nodes(), self._graph.nodes(), get_edges(self.old_graph), get_edges(self._graph), self.repo_name, self.projects[0].subdirectories) + post_reset_validation( + self.old_graph.nodes(), + self._graph.nodes(), + get_edges(self.old_graph), + get_edges(self._graph), + self.repo_name, + self.projects[0].subdirectories, + ) def save_commit(self, commit: GitCommit) -> None: if commit is not None: @@ -378,7 +431,7 @@ def prune_graph(self) -> None: def build_directory_tree(self) -> None: """Builds the directory tree for the codebase""" # Reset and rebuild the directory tree - self.directories = dict() + self.directories = {} for file_path, _ in self.projects[0].repo_operator.iter_files( subdirs=self.projects[0].subdirectories, @@ -389,7 +442,12 @@ def build_directory_tree(self) -> None: directory = self.get_directory(file_path.parent, create_on_missing=True) directory._add_file(file_path.name) - def get_directory(self, directory_path: PathLike, create_on_missing: bool = False, ignore_case: bool = False) -> Directory | None: + def get_directory( + self, + directory_path: PathLike, + create_on_missing: bool = False, + ignore_case: bool = False, + ) -> Directory | None: """Returns the directory object for the given path, or None if the directory does not exist. If create_on_missing is set, use a recursive strategy to create the directory object and all subdirectories. 
@@ -397,7 +455,9 @@ def get_directory(self, directory_path: PathLike, create_on_missing: bool = Fals # If not part of repo path, return None absolute_path = self.to_absolute(directory_path) if not self.is_subdir(absolute_path) and not self.config.allow_external: - assert False, f"Directory {absolute_path} is not part of repo path {self.repo_path}" + raise AssertionError( + f"Directory {absolute_path} is not part of repo path {self.repo_path}" + ) return None # Get the directory @@ -414,7 +474,9 @@ def get_directory(self, directory_path: PathLike, create_on_missing: bool = Fals parent_path = absolute_path.parent # Base Case - if str(absolute_path) == str(self.repo_path) or str(absolute_path) == str(parent_path): + if str(absolute_path) == str(self.repo_path) or str(absolute_path) == str( + parent_path + ): root_directory = Directory(ctx=self, path=absolute_path, dirpath="") self.directories[absolute_path] = root_directory return root_directory @@ -422,7 +484,11 @@ def get_directory(self, directory_path: PathLike, create_on_missing: bool = Fals # Recursively create the parent directory parent = self.get_directory(parent_path, create_on_missing=True) # Create the directory - directory = Directory(ctx=self, path=absolute_path, dirpath=str(self.to_relative(absolute_path))) + directory = Directory( + ctx=self, + path=absolute_path, + dirpath=str(self.to_relative(absolute_path)), + ) # Add the directory to the parent parent._add_subdirectory(directory.name) # Add the directory to the tree @@ -430,17 +496,25 @@ def get_directory(self, directory_path: PathLike, create_on_missing: bool = Fals return directory return None - def _process_diff_files(self, files_to_sync: Mapping[SyncType, list[Path]], incremental: bool = True) -> None: + def _process_diff_files( + self, files_to_sync: Mapping[SyncType, list[Path]], incremental: bool = True + ) -> None: # If all the files are empty, don't uncache assert self._computing is False - skip_uncache = incremental and ((len(files_to_sync[SyncType.DELETE]) + len(files_to_sync[SyncType.REPARSE])) == 0) + skip_uncache = incremental and ( + (len(files_to_sync[SyncType.DELETE]) + len(files_to_sync[SyncType.REPARSE])) + == 0 + ) if not skip_uncache: uncache_all() # Step 0: Start the dependency manager and language engine if they exist # Start the dependency manager. This may or may not run asynchronously, depending on the implementation if self.dependency_manager is not None: # Check if its inital start or a reparse - if not self.dependency_manager.ready() and not self.dependency_manager.error(): + if ( + not self.dependency_manager.ready() + and not self.dependency_manager.error() + ): # TODO: We do not reparse dependencies during syncs as it is expensive. 
We should probably add a flag for this logger.info("> Starting dependency manager") self.dependency_manager.start(async_start=False) @@ -457,9 +531,13 @@ def _process_diff_files(self, files_to_sync: Mapping[SyncType, list[Path]], incr # Step 1: Wait for dependency manager and language engines to finish before graph construction if self.dependency_manager is not None: - self.dependency_manager.wait_until_ready(ignore_error=self.config.ignore_process_errors) + self.dependency_manager.wait_until_ready( + ignore_error=self.config.ignore_process_errors + ) if self.language_engine is not None: - self.language_engine.wait_until_ready(ignore_error=self.config.ignore_process_errors) + self.language_engine.wait_until_ready( + ignore_error=self.config.ignore_process_errors + ) # ====== [ Refresh the graph] ======== # Step 2: For any files that no longer exist, remove them during the sync @@ -468,19 +546,29 @@ def _process_diff_files(self, files_to_sync: Mapping[SyncType, list[Path]], incr for file_path in files_to_sync[SyncType.ADD]: if not self.io.file_exists(self.to_absolute(file_path)): add_to_remove.append(file_path) - logger.warning(f"SYNC: SourceFile {file_path} no longer exists! Removing from graph") + logger.warning( + f"SYNC: SourceFile {file_path} no longer exists! Removing from graph" + ) reparse_to_remove = [] for file_path in files_to_sync[SyncType.REPARSE]: if not self.io.file_exists(self.to_absolute(file_path)): reparse_to_remove.append(file_path) - logger.warning(f"SYNC: SourceFile {file_path} no longer exists! Removing from graph") - files_to_sync[SyncType.ADD] = [f for f in files_to_sync[SyncType.ADD] if f not in add_to_remove] - files_to_sync[SyncType.REPARSE] = [f for f in files_to_sync[SyncType.REPARSE] if f not in reparse_to_remove] + logger.warning( + f"SYNC: SourceFile {file_path} no longer exists! Removing from graph" + ) + files_to_sync[SyncType.ADD] = [ + f for f in files_to_sync[SyncType.ADD] if f not in add_to_remove + ] + files_to_sync[SyncType.REPARSE] = [ + f for f in files_to_sync[SyncType.REPARSE] if f not in reparse_to_remove + ] for file_path in add_to_remove + reparse_to_remove: if self.get_file(file_path) is not None: files_to_sync[SyncType.DELETE].append(file_path) else: - logger.warning(f"SYNC: SourceFile {file_path} does not exist and also not found on graph!") + logger.warning( + f"SYNC: SourceFile {file_path} does not exist and also not found on graph!" 
+ ) # Step 3: Remove files to delete from graph to_resolve = [] @@ -488,35 +576,53 @@ def _process_diff_files(self, files_to_sync: Mapping[SyncType, list[Path]], incr file = self.get_file(file_path) file.remove_internal_edges() to_resolve.extend(file.unparse()) - to_resolve = list(filter(lambda node: self.has_node(node.node_id) and node is not None, to_resolve)) + to_resolve = list( + filter( + lambda node: self.has_node(node.node_id) and node is not None, + to_resolve, + ) + ) for file_path in files_to_sync[SyncType.REPARSE]: file = self.get_file(file_path) file.remove_internal_edges() - task = self.progress.begin("Reparsing updated files", count=len(files_to_sync[SyncType.REPARSE])) + task = self.progress.begin( + "Reparsing updated files", count=len(files_to_sync[SyncType.REPARSE]) + ) files_to_resolve = [] # Step 4: Reparse updated files for idx, file_path in enumerate(files_to_sync[SyncType.REPARSE]): task.update(f"Reparsing {self.to_relative(file_path)}", count=idx) file = self.get_file(file_path) to_resolve.extend(file.unparse(reparse=True)) - to_resolve = list(filter(lambda node: self.has_node(node.node_id) and node is not None, to_resolve)) + to_resolve = list( + filter( + lambda node: self.has_node(node.node_id) and node is not None, + to_resolve, + ) + ) file.sync_with_file_content() files_to_resolve.append(file) task.end() # Step 5: Add new files as nodes to graph (does not yet add edges) - task = self.progress.begin("Adding new files", count=len(files_to_sync[SyncType.ADD])) + task = self.progress.begin( + "Adding new files", count=len(files_to_sync[SyncType.ADD]) + ) for idx, filepath in enumerate(files_to_sync[SyncType.ADD]): task.update(f"Adding {self.to_relative(filepath)}", count=idx) try: content = self.io.read_text(filepath) - except UnicodeDecodeError as e: - logger.warning(f"Can't read file at:{filepath} since it contains non-unicode characters. File will be ignored!") + except UnicodeDecodeError: + logger.warning( + f"Can't read file at:{filepath} since it contains non-unicode characters. File will be ignored!" + ) continue # TODO: this is wrong with context changes if filepath.suffix in self.extensions: file_cls = self.node_classes.file_cls - new_file = file_cls.from_content(filepath, content, self, sync=False, verify_syntax=False) + new_file = file_cls.from_content( + filepath, content, self, sync=False, verify_syntax=False + ) if new_file is not None: files_to_resolve.append(new_file) task.end() @@ -524,7 +630,12 @@ def _process_diff_files(self, files_to_sync: Mapping[SyncType, list[Path]], incr to_resolve.append(file) to_resolve.extend(file.get_nodes()) - to_resolve = list(filter(lambda node: self.has_node(node.node_id) and node is not None, to_resolve)) + to_resolve = list( + filter( + lambda node: self.has_node(node.node_id) and node is not None, + to_resolve, + ) + ) counter = Counter(node.node_type for node in to_resolve) # Step 6: Build directory tree @@ -540,13 +651,19 @@ def _process_diff_files(self, files_to_sync: Mapping[SyncType, list[Path]], incr uncache_all() if self.config.disable_graph: - logger.warning("Graph generation is disabled. Skipping import and symbol resolution") + logger.warning( + "Graph generation is disabled. 
Skipping import and symbol resolution" + ) self._computing = False else: self._computing = True try: - logger.info(f"> Computing import resolution edges for {counter[NodeType.IMPORT]} imports") - task = self.progress.begin("Resolving imports", count=counter[NodeType.IMPORT]) + logger.info( + f"> Computing import resolution edges for {counter[NodeType.IMPORT]} imports" + ) + task = self.progress.begin( + "Resolving imports", count=counter[NodeType.IMPORT] + ) for node in to_resolve: if node.node_type == NodeType.IMPORT: task.update(f"Resolving imports in {node.filepath}", count=idx) @@ -555,11 +672,18 @@ def _process_diff_files(self, files_to_sync: Mapping[SyncType, list[Path]], incr to_resolve.extend(node.symbol_usages) task.end() if counter[NodeType.EXPORT] > 0: - logger.info(f"> Computing export dependencies for {counter[NodeType.EXPORT]} exports") - task = self.progress.begin("Computing export dependencies", count=counter[NodeType.EXPORT]) + logger.info( + f"> Computing export dependencies for {counter[NodeType.EXPORT]} exports" + ) + task = self.progress.begin( + "Computing export dependencies", count=counter[NodeType.EXPORT] + ) for node in to_resolve: if node.node_type == NodeType.EXPORT: - task.update(f"Computing export dependencies for {node.filepath}", count=idx) + task.update( + f"Computing export dependencies for {node.filepath}", + count=idx, + ) node._remove_internal_edges(EdgeType.EXPORT) node.compute_export_dependencies() to_resolve.extend(node.symbol_usages) @@ -568,10 +692,16 @@ def _process_diff_files(self, files_to_sync: Mapping[SyncType, list[Path]], incr from codegen.sdk.core.interfaces.inherits import Inherits logger.info("> Computing superclass dependencies") - task = self.progress.begin("Computing superclass dependencies", count=counter[NodeType.SYMBOL]) + task = self.progress.begin( + "Computing superclass dependencies", + count=counter[NodeType.SYMBOL], + ) for symbol in to_resolve: if isinstance(symbol, Inherits): - task.update(f"Computing superclass dependencies for {symbol.filepath}", count=idx) + task.update( + f"Computing superclass dependencies for {symbol.filepath}", + count=idx, + ) symbol._remove_internal_edges(EdgeType.SUBCLASS) symbol.compute_superclass_dependencies() task.end() @@ -610,24 +740,41 @@ def build_subgraph(self, nodes: list[NodeId]) -> PyDiGraph[Importable, Edge]: def get_node(self, node_id: int) -> Any: return self._graph.get_node_data(node_id) - def get_nodes(self, node_type: NodeType | None = None, exclude_type: NodeType | None = None) -> list[Importable]: + def get_nodes( + self, node_type: NodeType | None = None, exclude_type: NodeType | None = None + ) -> list[Importable]: if node_type is not None and exclude_type is not None: msg = "node_type and exclude_type cannot both be specified" raise ValueError(msg) if node_type is not None: - return [self.get_node(node_id) for node_id in self._graph.filter_nodes(lambda node: node.node_type == node_type)] + return [ + self.get_node(node_id) + for node_id in self._graph.filter_nodes( + lambda node: node.node_type == node_type + ) + ] if exclude_type is not None: - return [self.get_node(node_id) for node_id in self._graph.filter_nodes(lambda node: node.node_type != node_type)] + return [ + self.get_node(node_id) + for node_id in self._graph.filter_nodes( + lambda node: node.node_type != node_type + ) + ] return self._graph.nodes() def get_edges(self) -> list[tuple[NodeId, NodeId, EdgeType, Usage | None]]: - return [(x[0], x[1], x[2].type, x[2].usage) for x in self._graph.weighted_edge_list()] - 
- def get_file(self, file_path: os.PathLike, ignore_case: bool = False) -> SourceFile | None: + return [ + (x[0], x[1], x[2].type, x[2].usage) + for x in self._graph.weighted_edge_list() + ] + + def get_file( + self, file_path: os.PathLike, ignore_case: bool = False + ) -> SourceFile | None: # If not part of repo path, return None absolute_path = self.to_absolute(file_path) if not self.is_subdir(absolute_path) and not self.config.allow_external: - assert False, f"File {file_path} is not part of the repository path" + raise AssertionError(f"File {file_path} is not part of the repository path") # Check if file exists in graph node_id = self.filepath_idx.get(str(self.to_relative(file_path)), None) @@ -635,7 +782,9 @@ def get_file(self, file_path: os.PathLike, ignore_case: bool = False) -> SourceF return self.get_node(node_id) if ignore_case: # Using `get_directory` so that the case insensitive lookup works - parent = self.get_directory(self.to_absolute(file_path).parent, ignore_case=ignore_case).path + parent = self.get_directory( + self.to_absolute(file_path).parent, ignore_case=ignore_case + ).path for file in parent.iterdir(): if str(file_path).lower() == str(self.to_relative(file)).lower(): return self.get_file(file, ignore_case=False) @@ -647,38 +796,58 @@ def _get_raw_file_from_path(self, path: Path) -> File | None: return File.from_content(path, self.io.read_text(path), self, sync=False) except UnicodeDecodeError: # Handle when file is a binary file - return File.from_content(path, self.io.read_bytes(path), self, sync=False, binary=True) + return File.from_content( + path, self.io.read_bytes(path), self, sync=False, binary=True + ) - def get_external_module(self, module: str, import_name: str) -> ExternalModule | None: + def get_external_module( + self, module: str, import_name: str + ) -> ExternalModule | None: node_id = self._ext_module_idx.get(module + "::" + import_name, None) if node_id is not None: return self.get_node(node_id) def add_node(self, node: Importable) -> int: - if self.config.debug: - if self._graph.find_node_by_weight(node.__eq__): - msg = "Node already exists" - raise Exception(msg) - if self.config.debug and self._computing and node.node_type != NodeType.EXTERNAL: - assert False, f"Adding node during compute dependencies: {node!r}" + if self.config.debug and self._graph.find_node_by_weight(node.__eq__): + msg = "Node already exists" + raise Exception(msg) + if ( + self.config.debug + and self._computing + and node.node_type != NodeType.EXTERNAL + ): + raise AssertionError(f"Adding node during compute dependencies: {node!r}") return self._graph.add_node(node) - def add_child(self, parent: NodeId, node: Importable, type: EdgeType, usage: Usage | None = None) -> int: - if self.config.debug: - if self._graph.find_node_by_weight(node.__eq__): - msg = "Node already exists" - raise Exception(msg) - if self.config.debug and self._computing and node.node_type != NodeType.EXTERNAL: - assert False, f"Adding node during compute dependencies: {node!r}" + def add_child( + self, + parent: NodeId, + node: Importable, + type: EdgeType, + usage: Usage | None = None, + ) -> int: + if self.config.debug and self._graph.find_node_by_weight(node.__eq__): + msg = "Node already exists" + raise Exception(msg) + if ( + self.config.debug + and self._computing + and node.node_type != NodeType.EXTERNAL + ): + raise AssertionError(f"Adding node during compute dependencies: {node!r}") return self._graph.add_child(parent, node, Edge(type, usage)) def has_node(self, node_id: NodeId): return 
isinstance(node_id, int) and self._graph.has_node(node_id) def has_edge(self, u: NodeId, v: NodeId, edge: Edge): - return self._graph.has_edge(u, v) and edge in self._graph.get_all_edge_data(u, v) + return self._graph.has_edge(u, v) and edge in self._graph.get_all_edge_data( + u, v + ) - def add_edge(self, u: NodeId, v: NodeId, type: EdgeType, usage: Usage | None = None) -> None: + def add_edge( + self, u: NodeId, v: NodeId, type: EdgeType, usage: Usage | None = None + ) -> None: edge = Edge(type, usage) if self.config.debug: assert self._graph.has_node(u) @@ -691,7 +860,11 @@ def add_edges(self, edges: list[tuple[NodeId, NodeId, Edge]]) -> None: for u, v, edge in edges: assert self._graph.has_node(u) assert self._graph.has_node(v), v - assert not self.has_edge(u, v, edge), (self.get_node(u), self.get_node(v), edge) + assert not self.has_edge(u, v, edge), ( + self.get_node(u), + self.get_node(v), + edge, + ) self._graph.add_edges_from(edges) @property @@ -703,16 +876,29 @@ def edges(self) -> WeightedEdgeList[Edge]: return self._graph.weighted_edge_list() def predecessor(self, n: NodeId, *, edge_type: EdgeType | None) -> Importable: - return self._graph.find_predecessor_node_by_edge(n, lambda edge: edge.type == edge_type) + return self._graph.find_predecessor_node_by_edge( + n, lambda edge: edge.type == edge_type + ) - def predecessors(self, n: NodeId, edge_type: EdgeType | None = None) -> Sequence[Importable]: + def predecessors( + self, n: NodeId, edge_type: EdgeType | None = None + ) -> Sequence[Importable]: if edge_type is not None: - return sort_editables(self._graph.find_predecessors_by_edge(n, lambda edge: edge.type == edge_type), by_id=True) + return sort_editables( + self._graph.find_predecessors_by_edge( + n, lambda edge: edge.type == edge_type + ), + by_id=True, + ) return self._graph.predecessors(n) - def successors(self, n: NodeId, *, edge_type: EdgeType | None = None, sort: bool = True) -> Sequence[Importable]: + def successors( + self, n: NodeId, *, edge_type: EdgeType | None = None, sort: bool = True + ) -> Sequence[Importable]: if edge_type is not None: - res = self._graph.find_successors_by_edge(n, lambda edge: edge.type == edge_type) + res = self._graph.find_successors_by_edge( + n, lambda edge: edge.type == edge_type + ) else: res = self._graph.successors(n) if sort: @@ -754,10 +940,19 @@ def to_relative(self, filepath: PathLike | str) -> Path: def is_subdir(self, path: PathLike | str) -> bool: path = self.to_absolute(path) - return path == Path(self.repo_path) or path.is_relative_to(self.repo_path) or Path(self.repo_path) in path.parents + return ( + path == Path(self.repo_path) + or path.is_relative_to(self.repo_path) + or Path(self.repo_path) in path.parents + ) @commiter - def commit_transactions(self, sync_graph: bool = True, sync_file: bool = True, files: set[Path] | None = None) -> None: + def commit_transactions( + self, + sync_graph: bool = True, + sync_file: bool = True, + files: set[Path] | None = None, + ) -> None: """Commits all transactions to the codebase, and syncs the graph to match the latest file changes. Should be called at the end of `execute` for every codemod group run. 
@@ -794,9 +989,16 @@ def add_single_file(self, filepath: PathLike) -> None: self.transaction_manager.check_limits() @contextmanager - def session(self, sync_graph: bool = True, commit: bool = True, session_options: SessionOptions = SessionOptions()) -> Generator[None, None, None]: + def session( + self, + sync_graph: bool = True, + commit: bool = True, + session_options: SessionOptions = SessionOptions(), + ) -> Generator[None, None, None]: self.session_options = session_options - self.transaction_manager.set_max_transactions(self.session_options.max_transactions) + self.transaction_manager.set_max_transactions( + self.session_options.max_transactions + ) self.transaction_manager.reset_stopwatch(self.session_options.max_seconds) try: yield None @@ -807,7 +1009,9 @@ def session(self, sync_graph: bool = True, commit: bool = True, session_options: if commit: self.commit_transactions(sync_graph) - def remove_directory(self, directory_path: PathLike, force: bool = False, cleanup: bool = True) -> None: + def remove_directory( + self, directory_path: PathLike, force: bool = False, cleanup: bool = True + ) -> None: """Removes a directory from the graph""" # Get the directory directory = self.get_directory(directory_path) @@ -840,6 +1044,8 @@ def remove_directory(self, directory_path: PathLike, force: bool = False, cleanu @property def ts_declassify(self) -> TSDeclassify: if self._ts_declassify is None: - self._ts_declassify = TSDeclassify(self.repo_path, self.projects[0].base_path) + self._ts_declassify = TSDeclassify( + self.repo_path, self.projects[0].base_path + ) self._ts_declassify.start() # Install react-declassify return self._ts_declassify diff --git a/codegen-on-oss/codegen_on_oss/analysis/document_functions.py b/codegen-on-oss/codegen_on_oss/analysis/document_functions.py index 3cc991218..c8b8d3b2f 100644 --- a/codegen-on-oss/codegen_on_oss/analysis/document_functions.py +++ b/codegen-on-oss/codegen_on_oss/analysis/document_functions.py @@ -12,7 +12,9 @@ def hop_through_imports(imp: Import) -> Symbol | ExternalModule: return imp.imported_symbol -def get_extended_context(symbol: Symbol, degree: int) -> tuple[set[Symbol], set[Symbol]]: +def get_extended_context( + symbol: Symbol, degree: int +) -> tuple[set[Symbol], set[Symbol]]: """Recursively collect dependencies and usages up to the specified degree. 
Args: @@ -47,7 +49,9 @@ def get_extended_context(symbol: Symbol, degree: int) -> tuple[set[Symbol], set[ if isinstance(usage_symbol, Symbol) and usage_symbol not in usages: usages.add(usage_symbol) - usage_deps, usage_usages = get_extended_context(usage_symbol, degree - 1) + usage_deps, usage_usages = get_extended_context( + usage_symbol, degree - 1 + ) dependencies.update(usage_deps) usages.update(usage_usages) @@ -60,23 +64,34 @@ def run(codebase: Codebase): N_DEGREE = 2 # Filter out test and tutorial functions first - functions = [f for f in codebase.functions if not any(pattern in f.name.lower() for pattern in ["test", "tutorial"]) and not any(pattern in f.filepath.lower() for pattern in ["test", "tutorial"])] + functions = [ + f + for f in codebase.functions + if not any(pattern in f.name.lower() for pattern in ["test", "tutorial"]) + and not any(pattern in f.filepath.lower() for pattern in ["test", "tutorial"]) + ] # Track progress for user feedback total_functions = len(functions) processed = 0 - print(f"Found {total_functions} functions to process (excluding tests and tutorials)") + print( + f"Found {total_functions} functions to process (excluding tests and tutorials)" + ) for function in functions: processed += 1 # Skip if already has docstring if function.docstring: - print(f"[{processed}/{total_functions}] Skipping {function.name} - already has docstring") + print( + f"[{processed}/{total_functions}] Skipping {function.name} - already has docstring" + ) continue - print(f"[{processed}/{total_functions}] Generating docstring for {function.name} at {function.filepath}") + print( + f"[{processed}/{total_functions}] Generating docstring for {function.name} at {function.filepath}" + ) # Collect context using N-degree dependencies and usages dependencies, usages = get_extended_context(function, N_DEGREE) @@ -113,7 +128,9 @@ def run(codebase: Codebase): if __name__ == "__main__": print("Parsing codebase...") - codebase = Codebase.from_repo("fastapi/fastapi", commit="887270ff8a54bb58c406b0651678a27589793d2f") + codebase = Codebase.from_repo( + "fastapi/fastapi", commit="887270ff8a54bb58c406b0651678a27589793d2f" + ) print("Running function...") run(codebase) diff --git a/codegen-on-oss/codegen_on_oss/analysis/harness_integration.py b/codegen-on-oss/codegen_on_oss/analysis/harness_integration.py new file mode 100644 index 000000000..327363bb8 --- /dev/null +++ b/codegen-on-oss/codegen_on_oss/analysis/harness_integration.py @@ -0,0 +1,207 @@ +""" +CodebaseAnalysisHarness - Integration of harness.py functionality for comprehensive codebase analysis. +""" + +import json +from pathlib import Path +from typing import Any + +from codegen import Codebase +from codegen.agents.code_agent import CodeAgent +from codegen.configs.models.codebase import CodebaseConfig +from loguru import logger + + +class CodebaseAnalysisHarness: + """ + A comprehensive harness for analyzing codebases, generating diffs, and tracking file changes. + Integrates core functionality from the original harness.py with enhanced analysis capabilities. + """ + + def __init__( + self, + codebase: Codebase, + base_commit: str | None = None, + metadata: dict[str, Any] | None = None, + ): + """ + Initialize the CodebaseAnalysisHarness with a codebase. 
+ + Args: + codebase: The Codebase object to analyze + base_commit: Optional base commit to compare against + metadata: Optional metadata to associate with the analysis + """ + self.codebase = codebase + self.base_commit = base_commit + self.metadata = metadata or {} + self.analysis_results: dict[str, Any] = {} + + @classmethod + def from_repo( + cls, + repo_full_name: str, + commit: str | None = None, + language: str = "python", + disable_file_parse: bool = False, + ) -> "CodebaseAnalysisHarness": + """ + Create a CodebaseAnalysisHarness from a repository. + + Args: + repo_full_name: The full name of the repository (e.g., "owner/repo") + commit: Optional commit hash to checkout + language: The primary language of the repository + disable_file_parse: Whether to disable file parsing + + Returns: + A new CodebaseAnalysisHarness instance + """ + config = CodebaseConfig( + disable_file_parse=disable_file_parse, + ) + codebase = Codebase.from_repo( + repo_full_name=repo_full_name, + commit=commit, + language=language, + config=config, + ) + return cls(codebase=codebase, base_commit=commit) + + def analyze_codebase(self) -> dict[str, Any]: + """ + Perform comprehensive analysis of the codebase. + + Returns: + A dictionary containing analysis results + """ + logger.info(f"Analyzing codebase: {self.codebase.repo_name}") + + # Collect basic repository information + repo_info = { + "repo_name": self.codebase.repo_name, + "language": self.codebase.language, + "base_commit": self.base_commit, + } + + # Get file statistics + file_stats = self._get_file_statistics() + + # Combine all results + self.analysis_results = { + "repo_info": repo_info, + "file_stats": file_stats, + "metadata": self.metadata, + } + + return self.analysis_results + + def _get_file_statistics(self) -> dict[str, Any]: + """ + Get statistics about files in the codebase. + + Returns: + A dictionary containing file statistics + """ + file_count = len(self.codebase.files) + file_extensions = {} + + for file in self.codebase.files: + ext = Path(file.path).suffix + if ext in file_extensions: + file_extensions[ext] += 1 + else: + file_extensions[ext] = 1 + + return { + "file_count": file_count, + "file_extensions": file_extensions, + } + + def get_diff(self, base: str | None = None) -> str: + """ + Get the diff between the current state and a base commit. + + Args: + base: The base commit to compare against (defaults to self.base_commit) + + Returns: + A string containing the diff + """ + base_commit = base or self.base_commit + if not base_commit: + logger.warning("No base commit specified for diff generation") + return "" + + return self.codebase.get_diff(base=base_commit) + + def files_in_patch(self, patch: str) -> list[str]: + """ + Extract the list of modified files from a unified diff patch string. + + Args: + patch: A unified diff patch string + + Returns: + A list of modified file paths + """ + files: set[str] = set() + for line in patch.split("\n"): + if line.startswith("--- a/") or line.startswith("+++ b/"): + fname = line.split("/", 1)[1] + files.add(fname) + + return list(files) + + def run_agent(self, prompt: str, model: str | None = None) -> dict[str, Any]: + """ + Run a CodeAgent on the codebase with the given prompt. 
+ + Args: + prompt: The prompt to send to the agent + model: Optional model to use for the agent + + Returns: + The result of the agent run + """ + tags = [str(value) for value in self.metadata.values() if value] + agent = CodeAgent(codebase=self.codebase, tags=tags, metadata=self.metadata) + + try: + result = agent.run(prompt=prompt) + + # Get the diff between the current state and the original commit + model_patch = self.get_diff(base=self.base_commit) + edited_files = self.files_in_patch(model_patch) + + return { + "result": result, + "model_patch": model_patch, + "edited_files": edited_files, + } + except Exception as agent_error: + logger.error(f"Agent run failed with error: {agent_error}") + raise + + def save_analysis_results(self, output_path: str | Path) -> Path: + """ + Save the analysis results to a JSON file. + + Args: + output_path: The path to save the results to + + Returns: + The path where the results were saved + """ + if not self.analysis_results: + logger.warning("No analysis results to save. Run analyze_codebase() first.") + self.analyze_codebase() + + output_path = Path(output_path) + output_path.parent.mkdir(parents=True, exist_ok=True) + + with open(output_path, "w") as f: + json.dump(self.analysis_results, f, indent=2) + + logger.info(f"Analysis results saved to {output_path}") + return output_path diff --git a/codegen-on-oss/codegen_on_oss/analysis/module_dependencies.py b/codegen-on-oss/codegen_on_oss/analysis/module_dependencies.py index 83cbc667c..30985a170 100644 --- a/codegen-on-oss/codegen_on_oss/analysis/module_dependencies.py +++ b/codegen-on-oss/codegen_on_oss/analysis/module_dependencies.py @@ -21,7 +21,9 @@ def run(codebase: Codebase): # Check if the import statement is importing an app for imp in import_statement.imports: # Assuming app imports follow a specific naming convention or structure - if "app" in imp.name: # Adjust this condition based on your app naming convention + if ( + "app" in imp.name + ): # Adjust this condition based on your app naming convention G.add_edge(app, imp.import_statement.source) nodes_to_remove = [node for node, degree in G.degree() if degree == 1] @@ -34,5 +36,9 @@ def run(codebase: Codebase): if __name__ == "__main__": - codebase = Codebase.from_repo("getsentry/sentry", commit="fb0d53b2210cc896fc3e2cf32dae149ea8a8a45a", language="python") + codebase = Codebase.from_repo( + "getsentry/sentry", + commit="fb0d53b2210cc896fc3e2cf32dae149ea8a8a45a", + language="python", + ) run(codebase) diff --git a/codegen-on-oss/codegen_on_oss/analysis/symbolattr.py b/codegen-on-oss/codegen_on_oss/analysis/symbolattr.py index 12ce97283..3242fc250 100644 --- a/codegen-on-oss/codegen_on_oss/analysis/symbolattr.py +++ b/codegen-on-oss/codegen_on_oss/analysis/symbolattr.py @@ -30,7 +30,9 @@ def print_symbol_attribution(codebase): # Print attribution for top symbols count = 0 - for symbol, usage_count in symbols_with_usages[:10]: # Look at top 10 most used symbols + for symbol, usage_count in symbols_with_usages[ + :10 + ]: # Look at top 10 most used symbols count += 1 print(f"\n📊 Symbol #{count}: {symbol.name} ({type(symbol).__name__})") print(f" • File: {symbol.filepath}") @@ -43,7 +45,14 @@ def print_symbol_attribution(codebase): print(" • Last editor: Not available") if hasattr(symbol, "editor_history") and symbol.editor_history: - print(f" • Editor history: {', '.join(symbol.editor_history[:5])}" + (f" and {len(symbol.editor_history) - 5} more..." 
if len(symbol.editor_history) > 5 else "")) + print( + f" • Editor history: {', '.join(symbol.editor_history[:5])}" + + ( + f" and {len(symbol.editor_history) - 5} more..." + if len(symbol.editor_history) > 5 + else "" + ) + ) else: print(" • Editor history: Not available") @@ -64,7 +73,10 @@ def print_symbol_attribution(codebase): repo_config = RepoConfig.from_repo_path(repo_path) repo_operator = RepoOperator(repo_config=repo_config) - project = ProjectConfig.from_repo_operator(repo_operator=repo_operator, programming_language=ProgrammingLanguage.PYTHON) + project = ProjectConfig.from_repo_operator( + repo_operator=repo_operator, + programming_language=ProgrammingLanguage.PYTHON, + ) codebase = Codebase(projects=[project]) else: # Use from_repo method for a well-known repository @@ -75,7 +87,9 @@ def print_symbol_attribution(codebase): language="python", ) - print(f"Codebase loaded with {len(codebase.files)} files and {len(codebase.symbols)} symbols") + print( + f"Codebase loaded with {len(codebase.files)} files and {len(codebase.symbols)} symbols" + ) # First run the analysis to gather attribution data print("\n🔍 Running AI impact analysis...") @@ -85,7 +99,7 @@ def print_symbol_attribution(codebase): print_symbol_attribution(codebase) except Exception as e: - print(f"\n❌ Error: {str(e)}") + print(f"\n❌ Error: {e!s}") import traceback traceback.print_exc() diff --git a/codegen-on-oss/codegen_on_oss/cli.py b/codegen-on-oss/codegen_on_oss/cli.py index c1807d13e..c3c849cf2 100644 --- a/codegen-on-oss/codegen_on_oss/cli.py +++ b/codegen-on-oss/codegen_on_oss/cli.py @@ -2,6 +2,7 @@ from pathlib import Path import click +import uvicorn from loguru import logger from codegen_on_oss.cache import cachedir @@ -124,5 +125,56 @@ def run( parser.parse(repo_url, commit_hash) +@cli.command() +@click.option( + "--host", + type=str, + default="127.0.0.1", + help="Host to bind the server to", +) +@click.option( + "--port", + type=int, + default=8000, + help="Port to bind the server to", +) +@click.option( + "--reload", + is_flag=True, + help="Enable auto-reload for development", +) +@click.option( + "--log-level", + type=click.Choice(["debug", "info", "warning", "error", "critical"]), + default="info", + help="Log level", +) +def serve( + host: str = "127.0.0.1", + port: int = 8000, + reload: bool = False, + log_level: str = "info", +): + """ + Start the CodeContextRetrievalServer. + + This server provides a REST API for codebase analysis, context management, + and agent execution. + """ + logger.add(sys.stdout, level=log_level.upper()) + logger.info(f"Starting CodeContextRetrievalServer on {host}:{port}") + + # Import here to avoid circular imports + + # Start the server + uvicorn.run( + "codegen_on_oss.context_server:app", + host=host, + port=port, + reload=reload, + log_level=log_level, + ) + + if __name__ == "__main__": cli() diff --git a/codegen-on-oss/codegen_on_oss/context_server/__init__.py b/codegen-on-oss/codegen_on_oss/context_server/__init__.py new file mode 100644 index 000000000..f8c342a57 --- /dev/null +++ b/codegen-on-oss/codegen_on_oss/context_server/__init__.py @@ -0,0 +1,7 @@ +""" +Context server module for providing a REST API to access codebase analysis and context functionality. 
+""" + +from codegen_on_oss.context_server.server import app, create_app + +__all__ = ["app", "create_app"] diff --git a/codegen-on-oss/codegen_on_oss/context_server/server.py b/codegen-on-oss/codegen_on_oss/context_server/server.py new file mode 100644 index 000000000..fc1339848 --- /dev/null +++ b/codegen-on-oss/codegen_on_oss/context_server/server.py @@ -0,0 +1,258 @@ +""" +CodeContextRetrievalServer - FastAPI server for accessing codebase analysis and context functionality. +""" + +import json +from pathlib import Path +from typing import Any + +from fastapi import FastAPI, HTTPException, Query +from loguru import logger +from pydantic import BaseModel, Field + +from codegen_on_oss.analysis.harness_integration import CodebaseAnalysisHarness +from codegen_on_oss.bucket_store import BucketStore +from codegen_on_oss.snapshot.context_snapshot import CodebaseContextSnapshot + + +# Define API models +class RepositoryInfo(BaseModel): + repo_full_name: str = Field(..., description="Full repository name (owner/repo)") + commit: str | None = Field(None, description="Commit hash to analyze") + language: str = Field("python", description="Primary language of the repository") + + +class AgentRunRequest(BaseModel): + prompt: str = Field(..., description="Prompt to send to the agent") + model: str | None = Field(None, description="Model to use for the agent") + metadata: dict[str, Any] | None = Field( + None, description="Metadata for the agent run" + ) + + +class SnapshotInfo(BaseModel): + snapshot_id: str = Field(..., description="ID of the snapshot") + created_at: str = Field(..., description="Creation timestamp") + repo_info: dict[str, Any] = Field(..., description="Repository information") + + +# Create FastAPI app +app = FastAPI( + title="Code Context Retrieval Server", + description="API for codebase analysis, context management, and agent execution", + version="0.1.0", +) + +# Global storage for active harnesses and snapshots +active_harnesses: dict[str, CodebaseAnalysisHarness] = {} +bucket_store: BucketStore | None = None + + +@app.on_event("startup") +async def startup_event(): + """Initialize resources on server startup.""" + global bucket_store + try: + bucket_store = BucketStore() + logger.info("Initialized bucket store for remote storage") + except Exception as e: + logger.warning(f"Failed to initialize bucket store: {e}") + logger.info("Continuing without remote storage capabilities") + + +@app.get("/") +async def root(): + """Root endpoint with API information.""" + return { + "name": "Code Context Retrieval Server", + "version": "0.1.0", + "endpoints": [ + "/analyze/repository", + "/analyze/file_stats", + "/snapshot/create", + "/snapshot/list", + "/snapshot/load/{snapshot_id}", + "/agent/run", + ], + } + + +@app.post("/analyze/repository", response_model=dict[str, Any]) +async def analyze_repository(repo_info: RepositoryInfo): + """ + Analyze a repository and return the results. + + Creates a new CodebaseAnalysisHarness for the repository and performs analysis. 
+ """ + harness_key = f"{repo_info.repo_full_name}:{repo_info.commit or 'latest'}" + + try: + # Create a new harness for the repository + harness = CodebaseAnalysisHarness.from_repo( + repo_full_name=repo_info.repo_full_name, + commit=repo_info.commit, + language=repo_info.language, + ) + + # Store the harness for later use + active_harnesses[harness_key] = harness + + # Perform analysis + results = harness.analyze_codebase() + except Exception as e: + logger.error(f"Repository analysis failed: {e}") + raise HTTPException(status_code=500, detail=str(e)) from e + else: + return { + "harness_key": harness_key, + "results": results, + } + + +@app.get("/analyze/file_stats", response_model=dict[str, Any]) +async def get_file_stats( + harness_key: str = Query(..., description="Key of the active harness"), +): + """ + Get file statistics for an analyzed repository. + """ + if harness_key not in active_harnesses: + raise HTTPException(status_code=404, detail=f"Harness {harness_key} not found") + + harness = active_harnesses[harness_key] + + if not harness.analysis_results: + # Run analysis if not already done + harness.analyze_codebase() + + return harness.analysis_results.get("file_stats", {}) + + +@app.post("/snapshot/create", response_model=dict[str, str]) +async def create_snapshot( + harness_key: str = Query(..., description="Key of the active harness"), + local_path: str | None = Query( + None, description="Optional local path to save the snapshot" + ), +): + """ + Create a snapshot of the current codebase state and analysis results. + """ + if harness_key not in active_harnesses: + raise HTTPException(status_code=404, detail=f"Harness {harness_key} not found") + + harness = active_harnesses[harness_key] + + try: + snapshot = CodebaseContextSnapshot(harness=harness, bucket_store=bucket_store) + snapshot_id = snapshot.create_snapshot(local_path=local_path) + except Exception as e: + logger.error(f"Snapshot creation failed: {e}") + raise HTTPException(status_code=500, detail=str(e)) from e + else: + return { + "snapshot_id": snapshot_id, + "message": "Snapshot created successfully", + } + + +@app.get("/snapshot/list", response_model=list[SnapshotInfo]) +async def list_snapshots(): + """ + List all available snapshots. + """ + snapshots = [] + + # List snapshots from bucket store if available + if bucket_store: + try: + keys = bucket_store.list_keys(prefix="snapshots/") + for key in keys: + try: + data = bucket_store.get_json(key) + snapshots.append( + SnapshotInfo( + snapshot_id=data["snapshot_id"], + created_at=data["created_at"], + repo_info=data["repo_info"], + ) + ) + except Exception as e: + logger.warning(f"Failed to load snapshot {key}: {e}") + except Exception as e: + logger.warning(f"Failed to list snapshots from bucket store: {e}") + + # List local snapshots + for directory in [Path("./snapshots"), Path("./data/snapshots")]: + if directory.exists(): + for snapshot_file in directory.glob("snapshot_*.json"): + try: + with open(snapshot_file) as f: + data = json.load(f) + snapshots.append( + SnapshotInfo( + snapshot_id=data["snapshot_id"], + created_at=data["created_at"], + repo_info=data["repo_info"], + ) + ) + except Exception as e: + logger.warning( + f"Failed to load local snapshot {snapshot_file}: {e}" + ) + + return snapshots + + +@app.get("/snapshot/load/{snapshot_id}", response_model=dict[str, Any]) +async def load_snapshot(snapshot_id: str): + """ + Load a snapshot by ID and return its data. 
+ """ + try: + snapshot = CodebaseContextSnapshot( + snapshot_id=snapshot_id, bucket_store=bucket_store + ) + data = snapshot.load_snapshot() + except Exception as e: + logger.error(f"Failed to load snapshot {snapshot_id}: {e}") + raise HTTPException( + status_code=404, detail=f"Snapshot {snapshot_id} not found" + ) from e + else: + return data + + +@app.post("/agent/run", response_model=dict[str, Any]) +async def run_agent( + request: AgentRunRequest, + harness_key: str = Query(..., description="Key of the active harness"), +): + """ + Run an agent on the codebase with the given prompt. + """ + if harness_key not in active_harnesses: + raise HTTPException(status_code=404, detail=f"Harness {harness_key} not found") + + harness = active_harnesses[harness_key] + + try: + # Update metadata if provided + if request.metadata: + harness.metadata.update(request.metadata) + + # Run the agent + result = harness.run_agent(prompt=request.prompt, model=request.model) + except Exception as e: + logger.error(f"Agent run failed: {e}") + raise HTTPException(status_code=500, detail=str(e)) from e + else: + return { + "harness_key": harness_key, + "result": result, + } + + +def create_app() -> FastAPI: + """Create and configure the FastAPI application.""" + return app diff --git a/codegen-on-oss/codegen_on_oss/snapshot/__init__.py b/codegen-on-oss/codegen_on_oss/snapshot/__init__.py new file mode 100644 index 000000000..228a38382 --- /dev/null +++ b/codegen-on-oss/codegen_on_oss/snapshot/__init__.py @@ -0,0 +1,7 @@ +""" +Snapshot module for codegen-on-oss. +""" + +from codegen_on_oss.snapshot.context_snapshot import CodebaseContextSnapshot + +__all__ = ["CodebaseContextSnapshot"] diff --git a/codegen-on-oss/codegen_on_oss/snapshot/context_snapshot.py b/codegen-on-oss/codegen_on_oss/snapshot/context_snapshot.py new file mode 100644 index 000000000..30b356223 --- /dev/null +++ b/codegen-on-oss/codegen_on_oss/snapshot/context_snapshot.py @@ -0,0 +1,262 @@ +""" +CodebaseContextSnapshot - Module for saving and restoring codebase state and analysis results. +""" + +import json +import uuid +from datetime import datetime +from pathlib import Path +from typing import Any + +from loguru import logger + +from codegen_on_oss.analysis.harness_integration import CodebaseAnalysisHarness +from codegen_on_oss.bucket_store import BucketStore + + +class NoBucketStoreError(ValueError): + """Error raised when no bucket store is configured for remote storage.""" + + pass + + +class SnapshotNotFoundError(FileNotFoundError): + """Error raised when a snapshot cannot be found locally.""" + + pass + + +class SnapshotLoadError(ValueError): + """Error raised when a snapshot could not be loaded.""" + + pass + + +class NoHarnessError(ValueError): + """Error raised when no harness is provided for snapshot creation.""" + + pass + + +class NoSnapshotIdError(ValueError): + """Error raised when no snapshot ID is provided.""" + + pass + + +class CodebaseContextSnapshot: + """ + Allows saving and restoring codebase state and analysis results. + Integrates with S3-compatible storage via BucketStore. + """ + + def __init__( + self, + harness: CodebaseAnalysisHarness | None = None, + snapshot_id: str | None = None, + bucket_store: BucketStore | None = None, + ): + """ + Initialize a CodebaseContextSnapshot. 
+ + Args: + harness: Optional CodebaseAnalysisHarness to snapshot + snapshot_id: Optional existing snapshot ID to load + bucket_store: Optional BucketStore for remote storage + """ + self.harness = harness + self.snapshot_id = snapshot_id or str(uuid.uuid4()) + self.bucket_store = bucket_store + self.snapshot_data: dict[str, Any] = {} + + if snapshot_id and bucket_store: + self.load_snapshot() + + def create_snapshot(self, local_path: str | Path | None = None) -> str: + """ + Create a snapshot of the current codebase state and analysis results. + + Args: + local_path: Optional local path to save the snapshot + + Returns: + The snapshot ID + """ + if not self.harness: + raise NoHarnessError() + + # Ensure we have analysis results + if not self.harness.analysis_results: + logger.info("No analysis results found, running analysis...") + self.harness.analyze_codebase() + + # Create snapshot data + self.snapshot_data = { + "snapshot_id": self.snapshot_id, + "created_at": datetime.now().isoformat(), + "repo_info": { + "repo_name": self.harness.codebase.repo_name, + "commit": self.harness.base_commit, + }, + "analysis_results": self.harness.analysis_results, + } + + # Save locally if path provided + if local_path: + self._save_local(Path(local_path)) + + # Save to bucket store if available + if self.bucket_store: + self._save_remote() + + logger.info(f"Created snapshot with ID: {self.snapshot_id}") + return self.snapshot_id + + def load_snapshot(self, snapshot_id: str | None = None) -> dict[str, Any]: + """ + Load a snapshot by ID. + + Args: + snapshot_id: Optional snapshot ID to load (defaults to self.snapshot_id) + + Returns: + The loaded snapshot data + """ + snapshot_id = snapshot_id or self.snapshot_id + if not snapshot_id: + raise NoSnapshotIdError() + + # Try to load from bucket store first + if self.bucket_store: + try: + self.snapshot_data = self._load_remote(snapshot_id) + except Exception as e: + logger.warning(f"Failed to load snapshot from remote: {e}") + else: + logger.info(f"Loaded snapshot {snapshot_id} from remote storage") + return self.snapshot_data + + # Fall back to local storage + try: + self.snapshot_data = self._load_local(snapshot_id) + except Exception as e: + logger.error(f"Failed to load snapshot {snapshot_id}: {e}") + raise SnapshotLoadError() from e + else: + logger.info(f"Loaded snapshot {snapshot_id} from local storage") + return self.snapshot_data + + def save_to_remote(self) -> str: + """ + Save the snapshot to remote storage. + + Returns: + The snapshot ID. + + Raises: + NoBucketStoreError: If no bucket store is configured. + """ + if not self.bucket_store: + raise NoBucketStoreError() + + key = f"snapshots/snapshot_{self.snapshot_id}.json" + self.bucket_store.put_json(key, self.snapshot_data) + logger.debug(f"Saved snapshot to remote storage with key {key}") + return key + + def _save_local(self, directory: Path) -> Path: + """ + Save the snapshot to a local file. + + Args: + directory: Directory to save the snapshot in + + Returns: + Path to the saved snapshot file + """ + directory.mkdir(parents=True, exist_ok=True) + snapshot_path = directory / f"snapshot_{self.snapshot_id}.json" + + with open(snapshot_path, "w") as f: + json.dump(self.snapshot_data, f, indent=2) + + logger.debug(f"Saved snapshot to {snapshot_path}") + return snapshot_path + + def _load_local(self, snapshot_id: str) -> dict[str, Any]: + """ + Load a snapshot from a local file. 
+ + Args: + snapshot_id: ID of the snapshot to load + + Returns: + The loaded snapshot data + + Raises: + SnapshotNotFoundError: If the snapshot cannot be found locally + """ + # Try common snapshot directories + for directory in [Path("./snapshots"), Path("./data/snapshots")]: + snapshot_path = directory / f"snapshot_{snapshot_id}.json" + if snapshot_path.exists(): + with open(snapshot_path) as f: + return json.load(f) + + raise SnapshotNotFoundError(snapshot_id) + + def _load_remote(self, snapshot_id: str) -> dict[str, Any]: + """ + Load a snapshot from remote storage. + + Args: + snapshot_id: ID of the snapshot to load + + Returns: + The loaded snapshot data + + Raises: + NoBucketStoreError: If no bucket store is configured + """ + if not self.bucket_store: + raise NoBucketStoreError() + + key = f"snapshots/snapshot_{snapshot_id}.json" + return self.bucket_store.get_json(key) + + def _save_remote(self) -> str: + """ + Save the snapshot to remote storage. + + Returns: + The key where the snapshot was saved. + + Raises: + NoBucketStoreError: If no bucket store is configured. + """ + return self.save_to_remote() + + @classmethod + def load_from_remote( + cls, snapshot_id: str, bucket_store: BucketStore + ) -> "CodebaseContextSnapshot": + """ + Load a snapshot from remote storage. + + Args: + snapshot_id: The ID of the snapshot to load. + bucket_store: The bucket store to use for loading. + + Returns: + A CodebaseContextSnapshot instance. + + Raises: + NoBucketStoreError: If no bucket store is provided. + SnapshotLoadError: If the snapshot could not be loaded. + """ + if not bucket_store: + raise NoBucketStoreError() + + snapshot = cls(snapshot_id=snapshot_id, bucket_store=bucket_store) + snapshot.load_snapshot() + return snapshot diff --git a/codegen-on-oss/codegen_on_oss/snapshot/event_handlers.py b/codegen-on-oss/codegen_on_oss/snapshot/event_handlers.py index 534dff186..e92f0c70c 100644 --- a/codegen-on-oss/codegen_on_oss/snapshot/event_handlers.py +++ b/codegen-on-oss/codegen_on_oss/snapshot/event_handlers.py @@ -1,16 +1,17 @@ +import logging +from typing import Literal + +import modal +from classy_fastapi import Routable, post from codegen.agents.code_agent import CodeAgent from codegen.extensions.events.codegen_app import CodegenApp -from codegen.extensions.linear.types import LinearEvent -from codegen.extensions.slack.types import SlackEvent from codegen.extensions.events.modal.base import CodebaseEventsApp, EventRouterMixin from codegen.extensions.github.types.pull_request import PullRequestLabeledEvent -from pr_tasks import lint_for_dev_import_violations -from typing import Literal +from codegen.extensions.linear.types import LinearEvent +from codegen.extensions.slack.types import SlackEvent from dotenv import load_dotenv from fastapi import FastAPI, Request -from classy_fastapi import Routable, post -import modal -import logging +from pr_tasks import lint_for_dev_import_violations load_dotenv(".env") @@ -41,7 +42,12 @@ event_handlers_app = modal.App("codegen-event-handlers") -@event_handlers_app.cls(image=base_image, secrets=[modal.Secret.from_dotenv(".env")], enable_memory_snapshot=True, container_idle_timeout=300) +@event_handlers_app.cls( + image=base_image, + secrets=[modal.Secret.from_dotenv(".env")], + enable_memory_snapshot=True, + container_idle_timeout=300, +) class CustomEventHandlersAPI(CodebaseEventsApp): commit: str = modal.parameter(default="79114f67ccfe8700416cd541d1c7c43659a95342") repo_org: str = modal.parameter(default="codegen-sh") @@ -64,9 +70,15 @@ async def 
handle_mention(event: SlackEvent): logger.info("[CODE_AGENT] Running code agent") response = agent.run(event.text) - cg.slack.client.chat_postMessage(channel=event.channel, text=response, thread_ts=event.ts) + cg.slack.client.chat_postMessage( + channel=event.channel, text=response, thread_ts=event.ts + ) - return {"message": "Mentioned", "received_text": event.text, "response": response} + return { + "message": "Mentioned", + "received_text": event.text, + "response": response, + } @cg.github.event("pull_request:labeled") def handle_pr(event: PullRequestLabeledEvent): @@ -74,7 +86,9 @@ def handle_pr(event: PullRequestLabeledEvent): logger.info(f"PR head sha: {event.pull_request.head.sha}") codebase = cg.get_codebase() - logger.info(f"Codebase: {codebase.name} codebase.repo: {codebase.repo_path}") + logger.info( + f"Codebase: {codebase.name} codebase.repo: {codebase.repo_path}" + ) # =====[ Check out commit ]===== # Might require fetch? @@ -85,13 +99,21 @@ def handle_pr(event: PullRequestLabeledEvent): # LINT CODEMOD lint_for_dev_import_violations(codebase, event) - return {"message": "PR event handled", "num_files": len(codebase.files), "num_functions": len(codebase.functions)} + return { + "message": "PR event handled", + "num_files": len(codebase.files), + "num_functions": len(codebase.functions), + } @cg.linear.event("Issue") def handle_issue(event: LinearEvent): logger.info(f"Issue created: {event}") codebase = cg.get_codebase() - return {"message": "Linear Issue event", "num_files": len(codebase.files), "num_functions": len(codebase.functions)} + return { + "message": "Linear Issue event", + "num_files": len(codebase.files), + "num_functions": len(codebase.functions), + } @codegen_events_app.cls(image=base_image, secrets=[modal.Secret.from_dotenv(".env")]) @@ -99,11 +121,19 @@ class WebhookEventRouterAPI(EventRouterMixin, Routable): snapshot_index_id: str = SNAPSHOT_DICT_ID def get_event_handler_cls(self): - modal_cls = modal.Cls.from_name(app_name="Events", name="CustomEventHandlersAPI") + modal_cls = modal.Cls.from_name( + app_name="Events", name="CustomEventHandlersAPI" + ) return modal_cls @post("/{org}/{repo}/{provider}/events") - async def handle_event(self, org: str, repo: str, provider: Literal["slack", "github", "linear"], request: Request): + async def handle_event( + self, + org: str, + repo: str, + provider: Literal["slack", "github", "linear"], + request: Request, + ): # Define the route for the webhook url sink, it will need to indicate the repo repo org, and the provider return await super().handle_event(org, repo, provider, request) @@ -117,9 +147,15 @@ def api(self): # Setup a cron job to trigger updates to the codebase snapshots. 
-@codegen_events_app.function(schedule=modal.Cron("*/10 * * * *"), image=base_image, secrets=[modal.Secret.from_dotenv(".env")]) +@codegen_events_app.function( + schedule=modal.Cron("*/10 * * * *"), + image=base_image, + secrets=[modal.Secret.from_dotenv(".env")], +) def refresh_repository_snapshots(): - WebhookEventRouterAPI().refresh_repository_snapshots(snapshot_index_id=SNAPSHOT_DICT_ID) + WebhookEventRouterAPI().refresh_repository_snapshots( + snapshot_index_id=SNAPSHOT_DICT_ID + ) app = modal.App("Events", secrets=[modal.Secret.from_dotenv(".env")]) diff --git a/codegen-on-oss/codegen_on_oss/snapshot/helpers.py b/codegen-on-oss/codegen_on_oss/snapshot/helpers.py index 9d94f3b37..8ac58d9b1 100644 --- a/codegen-on-oss/codegen_on_oss/snapshot/helpers.py +++ b/codegen-on-oss/codegen_on_oss/snapshot/helpers.py @@ -1,28 +1,27 @@ -from github import Github -from codegen.extensions.github.types.events.pull_request import PullRequestUnlabeledEvent -from logging import getLogger - +import logging import os +from logging import getLogger -from codegen import Codebase - -from codegen.extensions.github.types.events.pull_request import PullRequestLabeledEvent +from codegen import CodeAgent, Codebase from codegen.configs.models.secrets import SecretsConfig -from codegen import CodeAgent - +from codegen.extensions.github.types.events.pull_request import ( + PullRequestLabeledEvent, + PullRequestUnlabeledEvent, +) from codegen.extensions.langchain.tools import ( - # Github - GithubViewPRTool, GithubCreatePRCommentTool, GithubCreatePRReviewCommentTool, + # Github + GithubViewPRTool, ) - from dotenv import load_dotenv -import logging +from github import Github load_dotenv() -logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s") +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" +) logger = getLogger(__name__) @@ -60,7 +59,11 @@ def remove_bot_comments(event: PullRequestUnlabeledEvent): def pr_review_agent(event: PullRequestLabeledEvent) -> None: # Pull a subset of SWE bench repo_str = f"{event.organization.login}/{event.repository.name}" - codebase = Codebase.from_repo(repo_str, language="python", secrets=SecretsConfig(github_token=os.environ["GITHUB_TOKEN"])) + codebase = Codebase.from_repo( + repo_str, + language="python", + secrets=SecretsConfig(github_token=os.environ["GITHUB_TOKEN"]), + ) review_atention_message = "CodegenBot is starting to review the PR please wait..." 
comment = codebase._op.create_pr_comment(event.number, review_atention_message) # Define tools first diff --git a/codegen-on-oss/codegen_on_oss/snapshot/pr_review.py b/codegen-on-oss/codegen_on_oss/snapshot/pr_review.py index 35102af2f..c0150e7b5 100644 --- a/codegen-on-oss/codegen_on_oss/snapshot/pr_review.py +++ b/codegen-on-oss/codegen_on_oss/snapshot/pr_review.py @@ -1,12 +1,18 @@ import logging from logging import getLogger + import modal from codegen.extensions.events.app import CodegenApp +from codegen.extensions.github.types.events.pull_request import ( + PullRequestLabeledEvent, + PullRequestUnlabeledEvent, +) from fastapi import Request -from codegen.extensions.github.types.events.pull_request import PullRequestLabeledEvent, PullRequestUnlabeledEvent -from helpers import remove_bot_comments, pr_review_agent +from helpers import pr_review_agent, remove_bot_comments -logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s") +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" +) logger = getLogger(__name__) REPO_URL = "https://github.com/codegen-sh/codegen-sdk.git" diff --git a/codegen-on-oss/codegen_on_oss/snapshot/pr_tasks.py b/codegen-on-oss/codegen_on_oss/snapshot/pr_tasks.py index 0a412ec30..c4204cee0 100644 --- a/codegen-on-oss/codegen_on_oss/snapshot/pr_tasks.py +++ b/codegen-on-oss/codegen_on_oss/snapshot/pr_tasks.py @@ -1,105 +1,92 @@ import logging -from codegen.extensions.github.types.pull_request import PullRequestLabeledEvent +from codegen.extensions.github.types.pull_request import PullRequestLabeledEvent from codegen.sdk.core.codebase import Codebase logging.basicConfig(level=logging.INFO, force=True) logger = logging.getLogger(__name__) -def lint_for_dev_import_violations(codebase: Codebase, event: PullRequestLabeledEvent): - # Next.js codemod to detect imports of the react-dev-overlay module in production code +def _check_for_dev_imports(file_path: str, file_content: str) -> list[tuple[int, str]]: + """ + Check a file for development imports. - patch, commit_shas, modified_symbols = codebase.get_modified_symbols_in_pr(event.pull_request.number) - modified_files = set(commit_shas.keys()) - from codegen.sdk.core.statements.if_block_statement import IfBlockStatement - - DIR_NAME = "packages/next/src/client/components/react-dev-overlay" - directory = codebase.get_directory(DIR_NAME) + Args: + file_path: Path to the file + file_content: Content of the file + Returns: + List of tuples containing line number and violation message + """ violations = [] + lines = file_content.splitlines() + + # Skip checking in test files, stories, etc. + if any(pattern in file_path for pattern in ["/test/", "/tests/", "/stories/", "/mocks/"]): + return violations - false_operators = ["!=", "!=="] - true_operators = ["===", "=="] - - def is_valid_block_expression(if_block: IfBlockStatement) -> bool: - """Check if the if block has a valid environment check condition. 
- - Valid conditions are: - - process.env.NODE_ENV !== 'production' - - process.env.NODE_ENV != 'production' - - process.env.NODE_ENV === 'development' - - process.env.NODE_ENV == 'development' - """ - if not if_block.is_if_statement: - return False - - condition = if_block.condition - # Get the operator without any whitespace - operator = condition.operator[-1].source - - # Check for non-production conditions - if operator in false_operators and condition.source == f"process.env.NODE_ENV {operator} 'production'": - return True - - # Check for explicit development conditions - if operator in true_operators and condition.source == f"process.env.NODE_ENV {operator} 'development'": - return True - - return False - - def process_else_block_expression(else_block: IfBlockStatement) -> bool: - """Check if the else block is valid by checking its parent if block. - - Valid when the parent if block checks for production environment: - - if (process.env.NODE_ENV === 'production') { ... } else { } - - if (process.env.NODE_ENV == 'production') { ... } else { } - """ - if not else_block.is_else_statement: - return False - - main_if = else_block._main_if_block - if not main_if or not main_if.condition: - return False - - condition = main_if.condition - operator = condition.operator[-1].source - - # Valid if the main if block checks for production - return operator in true_operators and condition.source == f"process.env.NODE_ENV {operator} 'production'" - - for file in directory.files(recursive=True): - for imp in file.inbound_imports: - if imp.file.filepath not in modified_files: - # skip if the import is not in the pull request's modified files - continue - # Skip if the import is from within the target directory - if directory.dirpath in imp.file.filepath: - # "✅ Valid import" if the import is within the target directory - continue - - parent_if_block = imp.parent_of_type(IfBlockStatement) - - # Check if import is in a valid environment check block - if_block_valid = parent_if_block and is_valid_block_expression(parent_if_block) - else_block_valid = parent_if_block and process_else_block_expression(parent_if_block) - - # Skip if the import is properly guarded by environment checks - if if_block_valid or else_block_valid: - # "✅ Valid import" these are guarded by non prod checks - continue - - # Report invalid imports that aren't properly guarded - violation = f"- Violation in `{file.filepath}`: Importing from `{imp.file.filepath}` ([link]({imp.github_url}))" - violations.append(violation) - logger.info(f"Found violation: {violation}") + # Check for react-dev-overlay imports + for i, line in enumerate(lines): + line_num = i + 1 + if "react-dev-overlay" in line and "import" in line: + violations.append( + (line_num, "Development import 'react-dev-overlay' found in production code") + ) + + return violations + +def lint_for_dev_import_violations(codebase: Codebase, event: PullRequestLabeledEvent): + """ + Next.js codemod to detect imports of the react-dev-overlay module in production code. 
+ + Args: + codebase: The codebase to analyze + event: The PR event that triggered this task + """ + violations = [] + # Get the files changed in the PR + changed_files = codebase.get_changed_files( + base_commit=event.pull_request.base.sha, + head_commit=event.pull_request.head.sha, + ) + + # Check each changed file for violations + for file_path in changed_files: + # Skip non-JS/TS files + if not file_path.endswith((".js", ".jsx", ".ts", ".tsx")): + continue + + try: + file_content = codebase.get_file_content(file_path) + file_violations = _check_for_dev_imports(file_path, file_content) + violations.extend([(file_path, *v) for v in file_violations]) + except Exception as e: + print(f"Error checking file {file_path}: {e}") + + # If violations found, comment on the PR if violations: - # Comment on PR with violations - review_attention_message = "## Dev Import Violations Found\n\n" - review_attention_message += "The following files have imports that violate development overlay rules:\n\n" - review_attention_message += "\n".join(violations) - review_attention_message += "\n\nPlease ensure that development imports are not imported in production code." - - # Create PR comment with the formatted message - codebase._op.create_pr_comment(event.pull_request.number, review_attention_message) + comment = "## Development Import Violations Found\n\n" + comment += "The following files contain imports of development modules that should not be used in production code:\n\n" + + for file_path, line_num, message in violations: + comment += f"- `{file_path}` (line {line_num}): {message}\n" + + comment += "\nPlease remove these imports before merging this PR." + + # Add the comment to the PR + codebase.github_client.create_pr_comment( + pr_number=event.pull_request.number, + body=comment, + ) + + return { + "status": "failure", + "message": "Development imports found in production code", + "violations": violations, + } + + return { + "status": "success", + "message": "No development imports found", + } diff --git a/codegen-on-oss/pyproject.toml b/codegen-on-oss/pyproject.toml index b4227c454..03a73e436 100644 --- a/codegen-on-oss/pyproject.toml +++ b/codegen-on-oss/pyproject.toml @@ -18,10 +18,13 @@ dependencies = [ "boto3>=1.36.21", "click>=8.1.8", "codegen>=0.6.2", + "fastapi>=0.110.0", "loguru>=0.7.3", "modal>=0.73.51", + "pydantic>=2.7.1", "pydantic-settings>=2.7.1", "pygithub>=2.5.0", + "uvicorn>=0.29.0", ] [project.urls] diff --git a/codegen-on-oss/scripts/example_usage.py b/codegen-on-oss/scripts/example_usage.py new file mode 100644 index 000000000..f2d6f980c --- /dev/null +++ b/codegen-on-oss/scripts/example_usage.py @@ -0,0 +1,70 @@ +#!/usr/bin/env python +""" +Example script demonstrating how to use the CodebaseAnalysisHarness and CodebaseContextSnapshot. +""" + +import argparse +import json +from pathlib import Path + +from loguru import logger + +from codegen_on_oss.analysis import CodebaseAnalysisHarness +from codegen_on_oss.snapshot import CodebaseContextSnapshot + + +def analyze_repo(repo_name: str, output_dir: Path, commit: str | None = None): + """ + Analyze a repository and save the results. 
+ + Args: + repo_name: The full name of the repository (e.g., "owner/repo") + output_dir: Directory to save the results + commit: Optional commit hash to analyze + """ + logger.info(f"Analyzing repository: {repo_name}") + + # Create the harness + harness = CodebaseAnalysisHarness.from_repo( + repo_full_name=repo_name, + commit=commit, + ) + + # Analyze the codebase + results = harness.analyze_codebase() + + # Save the results + output_dir.mkdir(parents=True, exist_ok=True) + output_path = output_dir / f"{repo_name.replace('/', '_')}_analysis.json" + + with open(output_path, "w") as f: + json.dump(results, f, indent=2) + + logger.info(f"Analysis results saved to {output_path}") + + # Create a snapshot + snapshot = CodebaseContextSnapshot(harness=harness) + snapshot_id = snapshot.create_snapshot(local_path=output_dir / "snapshots") + + logger.info(f"Created snapshot with ID: {snapshot_id}") + + return results, snapshot_id + + +def main(): + parser = argparse.ArgumentParser(description="Analyze a GitHub repository") + parser.add_argument("repo", help="Repository name (e.g., 'owner/repo')") + parser.add_argument("--commit", help="Commit hash to analyze") + parser.add_argument("--output-dir", default="./output", help="Output directory") + + args = parser.parse_args() + + analyze_repo( + repo_name=args.repo, + output_dir=Path(args.output_dir), + commit=args.commit, + ) + + +if __name__ == "__main__": + main() diff --git a/docs/mint.json b/docs/mint.json index 737c098ee..e83d7a630 100644 --- a/docs/mint.json +++ b/docs/mint.json @@ -1,396 +1,394 @@ { - "$schema": "https://mintlify.com/schema.json", - "name": "Codegen", - "logo": { - "dark": "https://cdn.prod.website-files.com/67070304751b9b01bf6a161c/679bcf45a3e32761c42b324b_Codegen_Logomark_Dark.svg", - "light": "https://cdn.prod.website-files.com/67070304751b9b01bf6a161c/679bcf45bf55446746125835_Codegen_Logomark_Light.svg" - }, - "modeToggle": { - "default": "dark" - }, - "metadata": { - "og:site_name": "Codegen", - "og:title": "Codegen - Manipulate Code at Scale", - "og:description": "A scriptable interface to a powerful, multi-lingual language server built on top of Tree-sitter.", - "og:url": "https://docs.codegen.com", - "og:locale": "en_US", - "og:logo": "https://i.imgur.com/f4OVOqI.png", - "article:publisher": "Codegen, Inc.", - "twitter:site": "@codegen" - }, - "favicon": "/favicon.svg", - "colors": { - "primary": "#a277ff", - "light": "#a277ff", - "dark": "#a277ff", - "anchors": { - "from": "#61ffca", - "to": "#61ffca" - } - }, - "theme": "prism", - "background": { - "style": "gradient" - }, - "analytics": { - "posthog": { - "apiKey": "phc_GLxaINoQJnuyCyxDmTciQqzdKBYFVDkY7bRBO4bDdso" - } - }, - "feedback": { - "thumbsRating": true - }, - "topbarCtaButton": { - "name": "GitHub", - "url": "https://github.com/codegen-sh/codegen-sdk" - }, - "tabs": [ - { - "name": "API Reference", - "url": "/api-reference" - }, - { - "name": "CLI", - "url": "/cli" - }, - { - "name": "Blog", - "url": "/blog" - }, - { - "name": "Changelog", - "url": "/changelog" - }, - { - "name": "codegen", - "url": "/gen" - } - ], - "navigation": [ - { - "group": "Introduction", - "pages": [ - "introduction/overview", - "introduction/getting-started", - "introduction/installation", - "introduction/ide-usage", - "introduction/work-with-ai", - "introduction/how-it-works", - "introduction/advanced-settings", - "introduction/guiding-principles", - "introduction/community", - "introduction/about", - "introduction/faq" - ] - }, - { - "group": "Tutorials", - "pages": [ - 
"tutorials/at-a-glance", - "tutorials/build-code-agent", - "tutorials/slack-bot", - "tutorials/github-review-bot", - "tutorials/deep-code-research", - "tutorials/codebase-analytics-dashboard", - "tutorials/training-data", - "tutorials/codebase-visualization", - "tutorials/migrating-apis", - "tutorials/organize-your-codebase", - "tutorials/promise-to-async-await", - "tutorials/modularity", - "tutorials/manage-feature-flags", - "tutorials/deleting-dead-code", - "tutorials/increase-type-coverage", - "tutorials/managing-typescript-exports", - "tutorials/converting-default-exports", - "tutorials/creating-documentation", - "tutorials/react-modernization", - "tutorials/unittest-to-pytest", - "tutorials/sqlalchemy-1.6-to-2.0", - "tutorials/fixing-import-loops-in-pytorch", - "tutorials/python2-to-python3", - "tutorials/flask-to-fastapi", - "tutorials/build-mcp", - "tutorials/neo4j-graph", - "tutorials/attributions" - ] - }, - { - "group": "Building with Codegen", - "pages": [ - "building-with-codegen/at-a-glance", - "building-with-codegen/parsing-codebases", - "building-with-codegen/reusable-codemods", - "building-with-codegen/dot-codegen", - "building-with-codegen/function-decorator", - "building-with-codegen/language-support", - "building-with-codegen/commit-and-reset", - "building-with-codegen/git-operations", - "building-with-codegen/files-and-directories", - "building-with-codegen/the-editable-api", - "building-with-codegen/symbol-api", - "building-with-codegen/class-api", - "building-with-codegen/imports", - "building-with-codegen/exports", - "building-with-codegen/inheritable-behaviors", - "building-with-codegen/statements-and-code-blocks", - "building-with-codegen/dependencies-and-usages", - "building-with-codegen/function-calls-and-callsites", - "building-with-codegen/variable-assignments", - "building-with-codegen/local-variables", - "building-with-codegen/comments-and-docstrings", - "building-with-codegen/external-modules", - "building-with-codegen/type-annotations", - "building-with-codegen/moving-symbols", - "building-with-codegen/collections", - "building-with-codegen/traversing-the-call-graph", - "building-with-codegen/react-and-jsx", - "building-with-codegen/codebase-visualization", - "building-with-codegen/flagging-symbols", - "building-with-codegen/calling-out-to-llms", - "building-with-codegen/semantic-code-search", - "building-with-codegen/reducing-conditions" - ] - }, - { - "group": "CLI", - "pages": [ - "cli/about", - "cli/init", - "cli/notebook", - "cli/create", - "cli/run", - "cli/reset", - "cli/expert" - ] - }, - { - "group": "Changelog", - "pages": [ - "changelog/changelog" - ] - }, - { - "group": "Blog", - "pages": [ - "blog/posts", - "blog/devin", - "blog/act-via-code", - "blog/promise-to-async-await-twilio", - "blog/fixing-import-loops" - ] - }, - { - "group": "codegen", - "pages": [ - "gen/introduction", - "gen/capabilities", - "gen/integrations", - "gen/faq" - ] - }, - { - "group": "API Reference", - "pages": [ - "api-reference/index", - { - "group": "Core", - "icon": "code", - "pages": [ - "api-reference/core/Argument", - "api-reference/core/Assignment", - "api-reference/core/AssignmentStatement", - "api-reference/core/Attribute", - "api-reference/core/AwaitExpression", - "api-reference/core/BinaryExpression", - "api-reference/core/BlockStatement", - "api-reference/core/Boolean", - "api-reference/core/Callable", - "api-reference/core/CatchStatement", - "api-reference/core/ChainedAttribute", - "api-reference/core/Class", - "api-reference/core/CodeBlock", - 
"api-reference/core/CodeOwner", - "api-reference/core/Codebase", - "api-reference/core/Comment", - "api-reference/core/CommentGroup", - "api-reference/core/ComparisonExpression", - "api-reference/core/Decorator", - "api-reference/core/Dict", - "api-reference/core/Directory", - "api-reference/core/Editable", - "api-reference/core/Export", - "api-reference/core/ExportStatement", - "api-reference/core/Exportable", - "api-reference/core/Expression", - "api-reference/core/ExpressionGroup", - "api-reference/core/ExpressionStatement", - "api-reference/core/ExternalModule", - "api-reference/core/File", - "api-reference/core/FlagKwargs", - "api-reference/core/ForLoopStatement", - "api-reference/core/Function", - "api-reference/core/FunctionCall", - "api-reference/core/GenericType", - "api-reference/core/HasBlock", - "api-reference/core/HasName", - "api-reference/core/HasValue", - "api-reference/core/IfBlockStatement", - "api-reference/core/Import", - "api-reference/core/ImportStatement", - "api-reference/core/ImportType", - "api-reference/core/Importable", - "api-reference/core/Interface", - "api-reference/core/List", - "api-reference/core/MessageType", - "api-reference/core/MultiExpression", - "api-reference/core/MultiLineCollection", - "api-reference/core/Name", - "api-reference/core/NamedType", - "api-reference/core/NoneType", - "api-reference/core/Number", - "api-reference/core/Pair", - "api-reference/core/Parameter", - "api-reference/core/ParenthesizedExpression", - "api-reference/core/Placeholder", - "api-reference/core/PlaceholderType", - "api-reference/core/RaiseStatement", - "api-reference/core/ReturnStatement", - "api-reference/core/SourceFile", - "api-reference/core/Span", - "api-reference/core/Statement", - "api-reference/core/StatementType", - "api-reference/core/String", - "api-reference/core/StubPlaceholder", - "api-reference/core/SubscriptExpression", - "api-reference/core/SwitchCase", - "api-reference/core/SwitchStatement", - "api-reference/core/Symbol", - "api-reference/core/SymbolGroup", - "api-reference/core/SymbolStatement", - "api-reference/core/TernaryExpression", - "api-reference/core/TryCatchStatement", - "api-reference/core/Tuple", - "api-reference/core/TupleType", - "api-reference/core/Type", - "api-reference/core/TypeAlias", - "api-reference/core/TypePlaceholder", - "api-reference/core/Typeable", - "api-reference/core/UnaryExpression", - "api-reference/core/UnionType", - "api-reference/core/Unpack", - "api-reference/core/Unwrappable", - "api-reference/core/Usable", - "api-reference/core/Usage", - "api-reference/core/UsageKind", - "api-reference/core/UsageType", - "api-reference/core/Value", - "api-reference/core/WhileStatement", - "api-reference/core/WithStatement" - ] - }, - { - "group": "Python", - "icon": "python", - "pages": [ - "api-reference/python/PyAssignment", - "api-reference/python/PyAssignmentStatement", - "api-reference/python/PyAttribute", - "api-reference/python/PyBlockStatement", - "api-reference/python/PyBreakStatement", - "api-reference/python/PyCatchStatement", - "api-reference/python/PyChainedAttribute", - "api-reference/python/PyClass", - "api-reference/python/PyCodeBlock", - "api-reference/python/PyComment", - "api-reference/python/PyCommentGroup", - "api-reference/python/PyCommentType", - "api-reference/python/PyConditionalExpression", - "api-reference/python/PyDecorator", - "api-reference/python/PyFile", - "api-reference/python/PyForLoopStatement", - "api-reference/python/PyFunction", - "api-reference/python/PyGenericType", - 
"api-reference/python/PyHasBlock", - "api-reference/python/PyIfBlockStatement", - "api-reference/python/PyImport", - "api-reference/python/PyImportStatement", - "api-reference/python/PyMatchCase", - "api-reference/python/PyMatchStatement", - "api-reference/python/PyNamedType", - "api-reference/python/PyParameter", - "api-reference/python/PyPassStatement", - "api-reference/python/PyReturnTypePlaceholder", - "api-reference/python/PyString", - "api-reference/python/PySymbol", - "api-reference/python/PyTryCatchStatement", - "api-reference/python/PyUnionType", - "api-reference/python/PyWhileStatement" - ] - }, - { - "group": "Typescript", - "icon": "js", - "pages": [ - "api-reference/typescript/JSXElement", - "api-reference/typescript/JSXExpression", - "api-reference/typescript/JSXProp", - "api-reference/typescript/TSArrayType", - "api-reference/typescript/TSAssignment", - "api-reference/typescript/TSAssignmentStatement", - "api-reference/typescript/TSAttribute", - "api-reference/typescript/TSBlockStatement", - "api-reference/typescript/TSCatchStatement", - "api-reference/typescript/TSChainedAttribute", - "api-reference/typescript/TSClass", - "api-reference/typescript/TSCodeBlock", - "api-reference/typescript/TSComment", - "api-reference/typescript/TSCommentGroup", - "api-reference/typescript/TSCommentType", - "api-reference/typescript/TSConditionalType", - "api-reference/typescript/TSConfig", - "api-reference/typescript/TSDecorator", - "api-reference/typescript/TSDict", - "api-reference/typescript/TSEnum", - "api-reference/typescript/TSExport", - "api-reference/typescript/TSExpressionType", - "api-reference/typescript/TSFile", - "api-reference/typescript/TSForLoopStatement", - "api-reference/typescript/TSFunction", - "api-reference/typescript/TSFunctionType", - "api-reference/typescript/TSGenericType", - "api-reference/typescript/TSHasBlock", - "api-reference/typescript/TSIfBlockStatement", - "api-reference/typescript/TSImport", - "api-reference/typescript/TSImportStatement", - "api-reference/typescript/TSInterface", - "api-reference/typescript/TSLabeledStatement", - "api-reference/typescript/TSLookupType", - "api-reference/typescript/TSNamedType", - "api-reference/typescript/TSNamespace", - "api-reference/typescript/TSObjectType", - "api-reference/typescript/TSPair", - "api-reference/typescript/TSParameter", - "api-reference/typescript/TSQueryType", - "api-reference/typescript/TSReadonlyType", - "api-reference/typescript/TSReturnTypePlaceholder", - "api-reference/typescript/TSString", - "api-reference/typescript/TSSwitchCase", - "api-reference/typescript/TSSwitchStatement", - "api-reference/typescript/TSSymbol", - "api-reference/typescript/TSTernaryExpression", - "api-reference/typescript/TSTryCatchStatement", - "api-reference/typescript/TSTypeAlias", - "api-reference/typescript/TSUndefinedType", - "api-reference/typescript/TSUnionType", - "api-reference/typescript/TSWhileStatement" - ] - } - ] - } - ], - "footerSocials": { - "x": "https://x.com/codegen", - "linkedin": "https://linkedin.com/company/codegen-dot-com" - } -} \ No newline at end of file + "$schema": "https://mintlify.com/schema.json", + "name": "Codegen", + "logo": { + "dark": "https://cdn.prod.website-files.com/67070304751b9b01bf6a161c/679bcf45a3e32761c42b324b_Codegen_Logomark_Dark.svg", + "light": "https://cdn.prod.website-files.com/67070304751b9b01bf6a161c/679bcf45bf55446746125835_Codegen_Logomark_Light.svg" + }, + "modeToggle": { + "default": "dark" + }, + "metadata": { + "og:site_name": "Codegen", + "og:title": "Codegen - 
Manipulate Code at Scale", + "og:description": "A scriptable interface to a powerful, multi-lingual language server built on top of Tree-sitter.", + "og:url": "https://docs.codegen.com", + "og:locale": "en_US", + "og:logo": "https://i.imgur.com/f4OVOqI.png", + "article:publisher": "Codegen, Inc.", + "twitter:site": "@codegen" + }, + "favicon": "/favicon.svg", + "colors": { + "primary": "#a277ff", + "light": "#a277ff", + "dark": "#a277ff", + "anchors": { + "from": "#61ffca", + "to": "#61ffca" + } + }, + "theme": "prism", + "background": { + "style": "gradient" + }, + "analytics": { + "posthog": { + "apiKey": "phc_GLxaINoQJnuyCyxDmTciQqzdKBYFVDkY7bRBO4bDdso" + } + }, + "feedback": { + "thumbsRating": true + }, + "topbarCtaButton": { + "name": "GitHub", + "url": "https://github.com/codegen-sh/codegen-sdk" + }, + "tabs": [ + { + "name": "API Reference", + "url": "/api-reference" + }, + { + "name": "CLI", + "url": "/cli" + }, + { + "name": "Blog", + "url": "/blog" + }, + { + "name": "Changelog", + "url": "/changelog" + }, + { + "name": "codegen", + "url": "/gen" + } + ], + "navigation": [ + { + "group": "Introduction", + "pages": [ + "introduction/overview", + "introduction/getting-started", + "introduction/installation", + "introduction/ide-usage", + "introduction/work-with-ai", + "introduction/how-it-works", + "introduction/advanced-settings", + "introduction/guiding-principles", + "introduction/community", + "introduction/about", + "introduction/faq" + ] + }, + { + "group": "Tutorials", + "pages": [ + "tutorials/at-a-glance", + "tutorials/build-code-agent", + "tutorials/slack-bot", + "tutorials/github-review-bot", + "tutorials/deep-code-research", + "tutorials/codebase-analytics-dashboard", + "tutorials/training-data", + "tutorials/codebase-visualization", + "tutorials/migrating-apis", + "tutorials/organize-your-codebase", + "tutorials/promise-to-async-await", + "tutorials/modularity", + "tutorials/manage-feature-flags", + "tutorials/deleting-dead-code", + "tutorials/increase-type-coverage", + "tutorials/managing-typescript-exports", + "tutorials/converting-default-exports", + "tutorials/creating-documentation", + "tutorials/react-modernization", + "tutorials/unittest-to-pytest", + "tutorials/sqlalchemy-1.6-to-2.0", + "tutorials/fixing-import-loops-in-pytorch", + "tutorials/python2-to-python3", + "tutorials/flask-to-fastapi", + "tutorials/build-mcp", + "tutorials/neo4j-graph", + "tutorials/attributions" + ] + }, + { + "group": "Building with Codegen", + "pages": [ + "building-with-codegen/at-a-glance", + "building-with-codegen/parsing-codebases", + "building-with-codegen/reusable-codemods", + "building-with-codegen/dot-codegen", + "building-with-codegen/function-decorator", + "building-with-codegen/language-support", + "building-with-codegen/commit-and-reset", + "building-with-codegen/git-operations", + "building-with-codegen/files-and-directories", + "building-with-codegen/the-editable-api", + "building-with-codegen/symbol-api", + "building-with-codegen/class-api", + "building-with-codegen/imports", + "building-with-codegen/exports", + "building-with-codegen/inheritable-behaviors", + "building-with-codegen/statements-and-code-blocks", + "building-with-codegen/dependencies-and-usages", + "building-with-codegen/function-calls-and-callsites", + "building-with-codegen/variable-assignments", + "building-with-codegen/local-variables", + "building-with-codegen/comments-and-docstrings", + "building-with-codegen/external-modules", + "building-with-codegen/type-annotations", + 
"building-with-codegen/moving-symbols", + "building-with-codegen/collections", + "building-with-codegen/traversing-the-call-graph", + "building-with-codegen/react-and-jsx", + "building-with-codegen/codebase-visualization", + "building-with-codegen/flagging-symbols", + "building-with-codegen/calling-out-to-llms", + "building-with-codegen/semantic-code-search", + "building-with-codegen/reducing-conditions" + ] + }, + { + "group": "CLI", + "pages": [ + "cli/about", + "cli/init", + "cli/notebook", + "cli/create", + "cli/run", + "cli/reset", + "cli/expert" + ] + }, + { + "group": "Changelog", + "pages": ["changelog/changelog"] + }, + { + "group": "Blog", + "pages": [ + "blog/posts", + "blog/devin", + "blog/act-via-code", + "blog/promise-to-async-await-twilio", + "blog/fixing-import-loops" + ] + }, + { + "group": "codegen", + "pages": [ + "gen/introduction", + "gen/capabilities", + "gen/integrations", + "gen/faq" + ] + }, + { + "group": "API Reference", + "pages": [ + "api-reference/index", + { + "group": "Core", + "icon": "code", + "pages": [ + "api-reference/core/Argument", + "api-reference/core/Assignment", + "api-reference/core/AssignmentStatement", + "api-reference/core/Attribute", + "api-reference/core/AwaitExpression", + "api-reference/core/BinaryExpression", + "api-reference/core/BlockStatement", + "api-reference/core/Boolean", + "api-reference/core/Callable", + "api-reference/core/CatchStatement", + "api-reference/core/ChainedAttribute", + "api-reference/core/Class", + "api-reference/core/CodeBlock", + "api-reference/core/CodeOwner", + "api-reference/core/Codebase", + "api-reference/core/Comment", + "api-reference/core/CommentGroup", + "api-reference/core/ComparisonExpression", + "api-reference/core/Decorator", + "api-reference/core/Dict", + "api-reference/core/Directory", + "api-reference/core/Editable", + "api-reference/core/Export", + "api-reference/core/ExportStatement", + "api-reference/core/Exportable", + "api-reference/core/Expression", + "api-reference/core/ExpressionGroup", + "api-reference/core/ExpressionStatement", + "api-reference/core/ExternalModule", + "api-reference/core/File", + "api-reference/core/FlagKwargs", + "api-reference/core/ForLoopStatement", + "api-reference/core/Function", + "api-reference/core/FunctionCall", + "api-reference/core/GenericType", + "api-reference/core/HasBlock", + "api-reference/core/HasName", + "api-reference/core/HasValue", + "api-reference/core/IfBlockStatement", + "api-reference/core/Import", + "api-reference/core/ImportStatement", + "api-reference/core/ImportType", + "api-reference/core/Importable", + "api-reference/core/Interface", + "api-reference/core/List", + "api-reference/core/MessageType", + "api-reference/core/MultiExpression", + "api-reference/core/MultiLineCollection", + "api-reference/core/Name", + "api-reference/core/NamedType", + "api-reference/core/NoneType", + "api-reference/core/Number", + "api-reference/core/Pair", + "api-reference/core/Parameter", + "api-reference/core/ParenthesizedExpression", + "api-reference/core/Placeholder", + "api-reference/core/PlaceholderType", + "api-reference/core/RaiseStatement", + "api-reference/core/ReturnStatement", + "api-reference/core/SourceFile", + "api-reference/core/Span", + "api-reference/core/Statement", + "api-reference/core/StatementType", + "api-reference/core/String", + "api-reference/core/StubPlaceholder", + "api-reference/core/SubscriptExpression", + "api-reference/core/SwitchCase", + "api-reference/core/SwitchStatement", + "api-reference/core/Symbol", + 
"api-reference/core/SymbolGroup", + "api-reference/core/SymbolStatement", + "api-reference/core/TernaryExpression", + "api-reference/core/TryCatchStatement", + "api-reference/core/Tuple", + "api-reference/core/TupleType", + "api-reference/core/Type", + "api-reference/core/TypeAlias", + "api-reference/core/TypePlaceholder", + "api-reference/core/Typeable", + "api-reference/core/UnaryExpression", + "api-reference/core/UnionType", + "api-reference/core/Unpack", + "api-reference/core/Unwrappable", + "api-reference/core/Usable", + "api-reference/core/Usage", + "api-reference/core/UsageKind", + "api-reference/core/UsageType", + "api-reference/core/Value", + "api-reference/core/WhileStatement", + "api-reference/core/WithStatement" + ] + }, + { + "group": "Python", + "icon": "python", + "pages": [ + "api-reference/python/PyAssignment", + "api-reference/python/PyAssignmentStatement", + "api-reference/python/PyAttribute", + "api-reference/python/PyBlockStatement", + "api-reference/python/PyBreakStatement", + "api-reference/python/PyCatchStatement", + "api-reference/python/PyChainedAttribute", + "api-reference/python/PyClass", + "api-reference/python/PyCodeBlock", + "api-reference/python/PyComment", + "api-reference/python/PyCommentGroup", + "api-reference/python/PyCommentType", + "api-reference/python/PyConditionalExpression", + "api-reference/python/PyDecorator", + "api-reference/python/PyFile", + "api-reference/python/PyForLoopStatement", + "api-reference/python/PyFunction", + "api-reference/python/PyGenericType", + "api-reference/python/PyHasBlock", + "api-reference/python/PyIfBlockStatement", + "api-reference/python/PyImport", + "api-reference/python/PyImportStatement", + "api-reference/python/PyMatchCase", + "api-reference/python/PyMatchStatement", + "api-reference/python/PyNamedType", + "api-reference/python/PyParameter", + "api-reference/python/PyPassStatement", + "api-reference/python/PyReturnTypePlaceholder", + "api-reference/python/PyString", + "api-reference/python/PySymbol", + "api-reference/python/PyTryCatchStatement", + "api-reference/python/PyUnionType", + "api-reference/python/PyWhileStatement" + ] + }, + { + "group": "Typescript", + "icon": "js", + "pages": [ + "api-reference/typescript/JSXElement", + "api-reference/typescript/JSXExpression", + "api-reference/typescript/JSXProp", + "api-reference/typescript/TSArrayType", + "api-reference/typescript/TSAssignment", + "api-reference/typescript/TSAssignmentStatement", + "api-reference/typescript/TSAttribute", + "api-reference/typescript/TSBlockStatement", + "api-reference/typescript/TSCatchStatement", + "api-reference/typescript/TSChainedAttribute", + "api-reference/typescript/TSClass", + "api-reference/typescript/TSCodeBlock", + "api-reference/typescript/TSComment", + "api-reference/typescript/TSCommentGroup", + "api-reference/typescript/TSCommentType", + "api-reference/typescript/TSConditionalType", + "api-reference/typescript/TSConfig", + "api-reference/typescript/TSDecorator", + "api-reference/typescript/TSDict", + "api-reference/typescript/TSEnum", + "api-reference/typescript/TSExport", + "api-reference/typescript/TSExpressionType", + "api-reference/typescript/TSFile", + "api-reference/typescript/TSForLoopStatement", + "api-reference/typescript/TSFunction", + "api-reference/typescript/TSFunctionType", + "api-reference/typescript/TSGenericType", + "api-reference/typescript/TSHasBlock", + "api-reference/typescript/TSIfBlockStatement", + "api-reference/typescript/TSImport", + "api-reference/typescript/TSImportStatement", + 
"api-reference/typescript/TSInterface", + "api-reference/typescript/TSLabeledStatement", + "api-reference/typescript/TSLookupType", + "api-reference/typescript/TSNamedType", + "api-reference/typescript/TSNamespace", + "api-reference/typescript/TSObjectType", + "api-reference/typescript/TSPair", + "api-reference/typescript/TSParameter", + "api-reference/typescript/TSQueryType", + "api-reference/typescript/TSReadonlyType", + "api-reference/typescript/TSReturnTypePlaceholder", + "api-reference/typescript/TSString", + "api-reference/typescript/TSSwitchCase", + "api-reference/typescript/TSSwitchStatement", + "api-reference/typescript/TSSymbol", + "api-reference/typescript/TSTernaryExpression", + "api-reference/typescript/TSTryCatchStatement", + "api-reference/typescript/TSTypeAlias", + "api-reference/typescript/TSUndefinedType", + "api-reference/typescript/TSUnionType", + "api-reference/typescript/TSWhileStatement" + ] + } + ] + } + ], + "footerSocials": { + "x": "https://x.com/codegen", + "linkedin": "https://linkedin.com/company/codegen-dot-com" + } +} diff --git a/src/codegen/agents/client/openapi_client/api_client.py b/src/codegen/agents/client/openapi_client/api_client.py index 3eaff5023..69e9baf2f 100644 --- a/src/codegen/agents/client/openapi_client/api_client.py +++ b/src/codegen/agents/client/openapi_client/api_client.py @@ -630,7 +630,7 @@ def __deserialize_datetime(self, string): :return: datetime. """ try: - return datetime.datetime.fromisoformat(string.replace('Z', '+00:00')) + return datetime.datetime.fromisoformat(string.replace("Z", "+00:00")) except ValueError: raise rest.ApiException(status=0, reason=(f"Failed to parse `{string}` as datetime object")) diff --git a/src/codegen/extensions/tools/search_files_by_name.py b/src/codegen/extensions/tools/search_files_by_name.py index b44f6da85..d28df5ba9 100644 --- a/src/codegen/extensions/tools/search_files_by_name.py +++ b/src/codegen/extensions/tools/search_files_by_name.py @@ -1,7 +1,7 @@ import math import shutil import subprocess -from typing import ClassVar, Optional +from typing import ClassVar from pydantic import Field @@ -88,13 +88,12 @@ def search_files_by_name( if files_per_page == math.inf: files_per_page = total_files total_pages = 1 - else: + else: total_pages = (total_files + files_per_page - 1) // files_per_page if total_files > 0 else 1 - - + # Ensure page is within valid range page = min(page, total_pages) - + # Get paginated results start_idx = (page - 1) * files_per_page end_idx = start_idx + files_per_page diff --git a/src/codegen/sdk/system-prompt.txt b/src/codegen/sdk/system-prompt.txt index f72a67db1..698d8d762 100644 --- a/src/codegen/sdk/system-prompt.txt +++ b/src/codegen/sdk/system-prompt.txt @@ -657,7 +657,7 @@ Codegen creates a custom Python environment in `.codegen/.venv`. Configure your ```bash .codegen/.venv/bin/python ``` - + Alternatively, create a `.vscode/settings.json`: ```json { @@ -679,7 +679,7 @@ Codegen creates a custom Python environment in `.codegen/.venv`. Configure your .codegen/.venv/bin/python ``` - + @@ -1630,8 +1630,8 @@ iconType: "solid" - Yes - [by design](/introduction/guiding-principles#python-first-composability). - + Yes - [by design](/introduction/guiding-principles#python-first-composability). + Codegen works like any other python package. It works alongside your IDE, version control system, and other development tools. - Currently, the codebase object can only parse source code files of one language at a time. 
This means that if you want to work with both Python and TypeScript files, you will need to create two separate codebase objects. + Currently, the codebase object can only parse source code files of one language at a time. This means that if you want to work with both Python and TypeScript files, you will need to create two separate codebase objects. ## Accessing Code @@ -3407,7 +3407,7 @@ for module, imports in module_imports.items(): Always check if imports resolve to external modules before modification to avoid breaking third-party package imports. - + ## Import Statements vs Imports @@ -3609,7 +3609,7 @@ for exp in file.exports: # Get original and current symbols current = exp.exported_symbol original = exp.resolved_symbol - + print(f"Re-exporting {original.name} from {exp.from_file.filepath}") print(f"Through: {' -> '.join(e.file.filepath for e in exp.export_chain)}") ``` @@ -3659,7 +3659,7 @@ for from_file, exports in file_exports.items(): When managing exports, consider the impact on your module's public API. Not all symbols that can be exported should be exported. - + --- title: "Inheritable Behaviors" @@ -4149,9 +4149,9 @@ If `A` depends on `B`, then `B` is used by `A`. This relationship is tracked in flowchart LR B(BaseClass) - - - + + + A(MyClass) B ---| used by |A A ---|depends on |B @@ -4320,7 +4320,7 @@ class A: def method_a(self): pass class B(A): - def method_b(self): + def method_b(self): self.method_a() class C(B): @@ -5210,7 +5210,7 @@ for attr in class_def.attributes: # Each attribute has an assignment property attr_type = attr.assignment.type # -> TypeAnnotation print(f"{attr.name}: {attr_type.source}") # e.g. "x: int" - + # Set attribute type attr.assignment.set_type("int") @@ -5227,7 +5227,7 @@ Union types ([UnionType](/api-reference/core/UnionType)) can be manipulated as c ```python # Get union type -union_type = function.return_type # -> A | B +union_type = function.return_type # -> A | B print(union_type.symbols) # ["A", "B"] # Add/remove options @@ -6078,13 +6078,13 @@ Here's an example of using flags during code analysis: ```python def analyze_codebase(codebase): - for function in codebase.functions: + for function in codebase.functions: # Check documentation if not function.docstring: function.flag( message="Missing docstring", ) - + # Check error handling if function.is_async and not function.has_try_catch: function.flag( @@ -6794,7 +6794,7 @@ Explore our tutorials to learn how to use Codegen for various code transformatio > Update API calls, handle breaking changes, and manage bulk updates across your codebase. - Convert Flask applications to FastAPI, updating routes and dependencies. - Migrate Python 2 code to Python 3, updating syntax and modernizing APIs. @@ -6827,9 +6827,9 @@ Explore our tutorials to learn how to use Codegen for various code transformatio > Restructure files, enforce naming conventions, and improve project layout. - Split large files, extract shared logic, and manage dependencies. @@ -6927,7 +6927,7 @@ The agent has access to powerful code viewing and manipulation tools powered by - `CreateFileTool`: Create new files - `DeleteFileTool`: Delete files - `RenameFileTool`: Rename files -- `EditFileTool`: Edit files +- `EditFileTool`: Edit files @@ -7434,7 +7434,7 @@ Be explicit about the changes, produce a short summary, and point out possible i Focus on facts and technical details, using code snippets where helpful. 
""" result = agent.run(prompt) - + # Clean up the temporary comment comment.delete() ``` @@ -7615,21 +7615,21 @@ def research(repo_name: Optional[str] = None, query: Optional[str] = None): """Start a code research session.""" # Initialize codebase codebase = initialize_codebase(repo_name) - + # Create and run the agent agent = create_research_agent(codebase) - + # Main research loop while True: if not query: query = Prompt.ask("[bold cyan]Research query[/bold cyan]") - + result = agent.invoke( {"input": query}, config={"configurable": {"thread_id": 1}} ) console.print(Markdown(result["messages"][-1].content)) - + query = None # Clear for next iteration ``` @@ -7677,7 +7677,7 @@ class CustomAnalysisTool(BaseTool): """Custom tool for specialized code analysis.""" name = "custom_analysis" description = "Performs specialized code analysis" - + def _run(self, query: str) -> str: # Custom analysis logic return results @@ -7817,7 +7817,7 @@ def calculate_maintainability_index( ## Line Metrics -Line metrics provide insights into the size, complexity, and maintainability of a codebase. These measurements help determine the scale of a project, identify areas that may need refactoring, and track the growth of the codebase over time. +Line metrics provide insights into the size, complexity, and maintainability of a codebase. These measurements help determine the scale of a project, identify areas that may need refactoring, and track the growth of the codebase over time. ### Lines of Code Lines of Code refers to the total number of lines in the source code, including blank lines and comments. This is accomplished with a simple count of all lines in the source file. @@ -8114,7 +8114,7 @@ from codegen import Codebase # Initialize codebase codebase = Codebase("path/to/posthog/") -# Create a directed graph for representing call relationships +# Create a directed graph for representing call relationships G = nx.DiGraph() # Configuration flags @@ -8136,7 +8136,7 @@ We'll create a function that will recursively traverse the call trace of a funct ```python def create_downstream_call_trace(src_func: Function, depth: int = 0): """Creates call graph by recursively traversing function calls - + Args: src_func (Function): Starting function for call graph depth (int): Current recursion depth @@ -8144,7 +8144,7 @@ def create_downstream_call_trace(src_func: Function, depth: int = 0): # Prevent infinite recursion if MAX_DEPTH <= depth: return - + # External modules are not functions if isinstance(src_func, ExternalModule): return @@ -8154,12 +8154,12 @@ def create_downstream_call_trace(src_func: Function, depth: int = 0): # Skip self-recursive calls if call.name == src_func.name: continue - + # Get called function definition func = call.function_definition if not func: continue - + # Apply configured filters if isinstance(func, ExternalModule) and IGNORE_EXTERNAL_MODULE_CALLS: continue @@ -8173,7 +8173,7 @@ def create_downstream_call_trace(src_func: Function, depth: int = 0): func_name = f"{func.parent_class.name}.{func.name}" if func.is_method else func.name # Add node and edge with metadata - G.add_node(func, name=func_name, + G.add_node(func, name=func_name, color=COLOR_PALETTE.get(func.__class__.__name__)) G.add_edge(src_func, func, **generate_edge_meta(call)) @@ -8188,10 +8188,10 @@ We can enrich our edges with metadata about the function calls: ```python def generate_edge_meta(call: FunctionCall) -> dict: """Generate metadata for call graph edges - + Args: call (FunctionCall): Function call information - + 
Returns: dict: Edge metadata including name and location """ @@ -8210,8 +8210,8 @@ Finally, we can visualize our call graph starting from a specific function: target_class = codebase.get_class('SharingConfigurationViewSet') target_method = target_class.get_method('patch') -# Add root node -G.add_node(target_method, +# Add root node +G.add_node(target_method, name=f"{target_class.name}.{target_method.name}", color=COLOR_PALETTE["StartFunction"]) @@ -8261,7 +8261,7 @@ The core function for building our dependency graph: ```python def create_dependencies_visualization(symbol: Symbol, depth: int = 0): """Creates visualization of symbol dependencies - + Args: symbol (Symbol): Starting symbol to analyze depth (int): Current recursion depth @@ -8269,11 +8269,11 @@ def create_dependencies_visualization(symbol: Symbol, depth: int = 0): # Prevent excessive recursion if depth >= MAX_DEPTH: return - + # Process each dependency for dep in symbol.dependencies: dep_symbol = None - + # Handle different dependency types if isinstance(dep, Symbol): # Direct symbol reference @@ -8284,13 +8284,13 @@ def create_dependencies_visualization(symbol: Symbol, depth: int = 0): if dep_symbol: # Add node with appropriate styling - G.add_node(dep_symbol, - color=COLOR_PALETTE.get(dep_symbol.__class__.__name__, + G.add_node(dep_symbol, + color=COLOR_PALETTE.get(dep_symbol.__class__.__name__, "#f694ff")) - + # Add dependency relationship G.add_edge(symbol, dep_symbol) - + # Recurse unless it's a class (avoid complexity) if not isinstance(dep_symbol, PyClass): create_dependencies_visualization(dep_symbol, depth + 1) @@ -8302,7 +8302,7 @@ Finally, we can visualize our dependency graph starting from a specific symbol: # Get target symbol target_func = codebase.get_function("get_query_runner") -# Add root node +# Add root node G.add_node(target_func, color=COLOR_PALETTE["StartFunction"]) # Generate dependency graph @@ -8345,16 +8345,16 @@ HTTP_METHODS = ["get", "put", "patch", "post", "head", "delete"] def generate_edge_meta(usage: Usage) -> dict: """Generate metadata for graph edges - + Args: usage (Usage): Usage relationship information - + Returns: dict: Edge metadata including name and location """ return { "name": usage.match.source, - "file_path": usage.match.filepath, + "file_path": usage.match.filepath, "start_point": usage.match.start_point, "end_point": usage.match.end_point, "symbol_name": usage.match.__class__.__name__ @@ -8362,10 +8362,10 @@ def generate_edge_meta(usage: Usage) -> dict: def is_http_method(symbol: PySymbol) -> bool: """Check if a symbol is an HTTP endpoint method - + Args: symbol (PySymbol): Symbol to check - + Returns: bool: True if symbol is an HTTP method """ @@ -8379,7 +8379,7 @@ The main function for creating our blast radius visualization: ```python def create_blast_radius_visualization(symbol: PySymbol, depth: int = 0): """Create visualization of symbol usage relationships - + Args: symbol (PySymbol): Starting symbol to analyze depth (int): Current recursion depth @@ -8387,11 +8387,11 @@ def create_blast_radius_visualization(symbol: PySymbol, depth: int = 0): # Prevent excessive recursion if depth >= MAX_DEPTH: return - + # Process each usage of the symbol for usage in symbol.usages: usage_symbol = usage.usage_symbol - + # Determine node color based on type if is_http_method(usage_symbol): color = COLOR_PALETTE.get("HTTP_METHOD") @@ -8401,7 +8401,7 @@ def create_blast_radius_visualization(symbol: PySymbol, depth: int = 0): # Add node and edge to graph G.add_node(usage_symbol, color=color) 
G.add_edge(symbol, usage_symbol, **generate_edge_meta(usage)) - + # Recursively process usage symbol create_blast_radius_visualization(usage_symbol, depth + 1) ``` @@ -8552,7 +8552,7 @@ for call in old_api.call_sites: f"data={call.get_arg_by_parameter_name('input').value}", f"timeout={call.get_arg_by_parameter_name('wait').value}" ] - + # Replace the old call with the new API call.replace(f"new_process_data({', '.join(args)})") ``` @@ -8566,10 +8566,10 @@ When updating chained method calls, like database queries or builder patterns: for execute_call in codebase.function_calls: if execute_call.name != "execute": continue - + # Get the full chain chain = execute_call.call_chain - + # Example: Add .timeout() before .execute() if "timeout" not in {call.name for call in chain}: execute_call.insert_before("timeout(30)") @@ -8588,45 +8588,45 @@ Here's a comprehensive example: ```python def migrate_api_v1_to_v2(codebase): old_api = codebase.get_function("create_user_v1") - + # Document all existing call patterns call_patterns = {} for call in old_api.call_sites: args = [arg.source for arg in call.args] pattern = ", ".join(args) call_patterns[pattern] = call_patterns.get(pattern, 0) + 1 - + print("Found call patterns:") for pattern, count in call_patterns.items(): print(f" {pattern}: {count} occurrences") - + # Create new API version new_api = old_api.copy() new_api.rename("create_user_v2") - + # Update parameter types new_api.get_parameter("email").type = "EmailStr" new_api.get_parameter("role").type = "UserRole" - + # Add new required parameters new_api.add_parameter("tenant_id: UUID") - + # Update all call sites for call in old_api.call_sites: # Get current arguments email_arg = call.get_arg_by_parameter_name("email") role_arg = call.get_arg_by_parameter_name("role") - + # Build new argument list with type conversions new_args = [ f"email=EmailStr({email_arg.value})", f"role=UserRole({role_arg.value})", "tenant_id=get_current_tenant_id()" ] - + # Replace old call with new version call.replace(f"create_user_v2({', '.join(new_args)})") - + # Add deprecation notice to old version old_api.add_decorator('@deprecated("Use create_user_v2 instead")') @@ -8648,10 +8648,10 @@ migrate_api_v1_to_v2(codebase) ```python # First update parameter names param.rename("new_name") - + # Then update types param.type = "new_type" - + # Finally update call sites for call in api.call_sites: # ... update calls @@ -8661,7 +8661,7 @@ migrate_api_v1_to_v2(codebase) ```python # Add new parameter with default api.add_parameter("new_param: str = None") - + # Later make it required api.get_parameter("new_param").remove_default() ``` @@ -8676,7 +8676,7 @@ migrate_api_v1_to_v2(codebase) Remember to test thoroughly after making bulk changes to APIs. While Codegen ensures syntactic correctness, you'll want to verify the semantic correctness of the changes. 
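One way to sanity-check a bulk migration before handing off to your test suite is to confirm the old entry point no longer has any call sites. The sketch below is illustrative rather than part of the original tutorial: it assumes the `migrate_api_v1_to_v2` example above has already been applied to the same `codebase` object, and it only uses the `get_function`, `call_sites`, and `source` accessors shown earlier.

```python
# Post-migration sanity check (sketch): confirm the old API has no remaining call sites.
# Assumes migrate_api_v1_to_v2(codebase) from the example above has already run.
old_api = codebase.get_function("create_user_v1")
remaining = list(old_api.call_sites)

if remaining:
    print(f"⚠️ {len(remaining)} call sites still reference create_user_v1:")
    for call in remaining:
        print(f"  - {call.source}")
else:
    print("✅ No call sites left for create_user_v1 — safe to run the test suite.")
```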
- + --- title: "Organizing Your Codebase" @@ -9240,16 +9240,16 @@ from collections import defaultdict # Create a graph of file dependencies def create_dependency_graph(): G = nx.DiGraph() - + for file in codebase.files: # Add node for this file G.add_node(file.filepath) - + # Add edges for each import for imp in file.imports: if imp.from_file: # Skip external imports G.add_edge(file.filepath, imp.from_file.filepath) - + return G # Create and analyze the graph @@ -9278,18 +9278,18 @@ def break_circular_dependency(cycle): # Get the first two files in the cycle file1 = codebase.get_file(cycle[0]) file2 = codebase.get_file(cycle[1]) - + # Create a shared module for common code shared_dir = "shared" if not codebase.has_directory(shared_dir): codebase.create_directory(shared_dir) - + # Find symbols used by both files shared_symbols = [] for symbol in file1.symbols: if any(usage.file == file2 for usage in symbol.usages): shared_symbols.append(symbol) - + # Move shared symbols to a new file if shared_symbols: shared_file = codebase.create_file(f"{shared_dir}/shared_types.py") @@ -9311,7 +9311,7 @@ def organize_file_imports(file): std_lib_imports = [] third_party_imports = [] local_imports = [] - + for imp in file.imports: if imp.is_standard_library: std_lib_imports.append(imp) @@ -9319,26 +9319,26 @@ def organize_file_imports(file): third_party_imports.append(imp) else: local_imports.append(imp) - + # Sort each group for group in [std_lib_imports, third_party_imports, local_imports]: group.sort(key=lambda x: x.module_name) - + # Remove all existing imports for imp in file.imports: imp.remove() - + # Add imports back in organized groups if std_lib_imports: for imp in std_lib_imports: file.add_import(imp.source) file.insert_after_imports("") # Add newline - + if third_party_imports: for imp in third_party_imports: file.add_import(imp.source) file.insert_after_imports("") # Add newline - + if local_imports: for imp in local_imports: file.add_import(imp.source) @@ -9357,22 +9357,22 @@ from collections import defaultdict def analyze_module_coupling(): coupling_scores = defaultdict(int) - + for file in codebase.files: # Count unique files imported from imported_files = {imp.from_file for imp in file.imports if imp.from_file} coupling_scores[file.filepath] = len(imported_files) - + # Count files that import this file - importing_files = {usage.file for symbol in file.symbols + importing_files = {usage.file for symbol in file.symbols for usage in symbol.usages if usage.file != file} coupling_scores[file.filepath] += len(importing_files) - + # Sort by coupling score - sorted_files = sorted(coupling_scores.items(), - key=lambda x: x[1], + sorted_files = sorted(coupling_scores.items(), + key=lambda x: x[1], reverse=True) - + print("\n🔍 Module Coupling Analysis:") print("\nMost coupled files:") for filepath, score in sorted_files[:5]: @@ -9390,9 +9390,9 @@ def extract_shared_code(file, min_usages=3): # Find symbols used by multiple files for symbol in file.symbols: # Get unique files using this symbol - using_files = {usage.file for usage in symbol.usages + using_files = {usage.file for usage in symbol.usages if usage.file != file} - + if len(using_files) >= min_usages: # Create appropriate shared module module_name = determine_shared_module(symbol) @@ -9400,7 +9400,7 @@ def extract_shared_code(file, min_usages=3): shared_file = codebase.create_file(f"shared/{module_name}.py") else: shared_file = codebase.get_file(f"shared/{module_name}.py") - + # Move symbol to shared module symbol.move_to_file(shared_file, 
strategy="update_all_imports") @@ -9454,7 +9454,7 @@ if feature_flag_class: # Initialize usage count for all attributes for attr in feature_flag_class.attributes: feature_flag_usage[attr.name] = 0 - + # Get all usages of the FeatureFlag class for usage in feature_flag_class.usages: usage_source = usage.usage_symbol.source if hasattr(usage, 'usage_symbol') else str(usage) @@ -10199,7 +10199,7 @@ Let's break down how this works: if export.is_reexport() and export.is_default_export(): print(f" 🔄 Converting default export '{export.name}'") ``` - + The code identifies default exports by checking: 1. If it's a re-export (`is_reexport()`) 2. If it's a default export (`is_default_export()`) @@ -10307,7 +10307,7 @@ for file in codebase.files: print(f"✨ Fixed exports in {target_file.filepath}") -``` +``` --- title: "Creating Documentation" @@ -10396,11 +10396,11 @@ for directory in codebase.directories: # Skip test, sql and alembic directories if any(x in directory.path.lower() for x in ['test', 'sql', 'alembic']): continue - + # Get undecorated functions funcs = [f for f in directory.functions if not f.is_decorated] total = len(funcs) - + # Only analyze dirs with >10 functions if total > 10: documented = sum(1 for f in funcs if f.docstring) @@ -10415,12 +10415,12 @@ for directory in codebase.directories: if dir_stats: lowest_dir = min(dir_stats.items(), key=lambda x: x[1]['coverage']) path, stats = lowest_dir - + print(f"📉 Lowest coverage directory: '{path}'") print(f" • Total functions: {stats['total']}") print(f" • Documented: {stats['documented']}") print(f" • Coverage: {stats['coverage']:.1f}%") - + # Print all directory stats for comparison print("\n📊 All directory coverage rates:") for path, stats in sorted(dir_stats.items(), key=lambda x: x[1]['coverage']): @@ -11208,7 +11208,7 @@ iconType: "solid" -Import loops occur when two or more Python modules depend on each other, creating a circular dependency. While some import cycles can be harmless, others can lead to runtime errors and make code harder to maintain. +Import loops occur when two or more Python modules depend on each other, creating a circular dependency. While some import cycles can be harmless, others can lead to runtime errors and make code harder to maintain. In this tutorial, we'll explore how to identify and fix problematic import cycles using Codegen. @@ -12105,7 +12105,7 @@ Match (s: Func )-[r: CALLS]-> (e:Func) RETURN s, e LIMIT 10 ```cypher Match path = (:(Method|Func)) -[:CALLS*5..10]-> (:(Method|Func)) -Return path +Return path LIMIT 20 ``` @@ -12123,8 +12123,8 @@ iconType: "solid" # AI Impact Analysis -This tutorial shows how to use Codegen's attribution extension to analyze the impact of AI on your -codebase. You'll learn how to identify which parts of your code were written by AI tools like +This tutorial shows how to use Codegen's attribution extension to analyze the impact of AI on your +codebase. You'll learn how to identify which parts of your code were written by AI tools like GitHub Copilot, Devin, or other AI assistants. Note: the code is flexible - you can track CI pipeline bots, or any other contributor you want. @@ -12203,7 +12203,7 @@ for symbol in codebase.symbols: ### Customizing AI Author Detection -By default, the analysis looks for common AI bot names in commit authors. +By default, the analysis looks for common AI bot names in commit authors. 
You can customize this by providing your own list of AI authors: ```python @@ -12244,34 +12244,34 @@ from codegen.shared.enums.programming_language import ProgrammingLanguage def analyze_contributors(codebase): """Analyze contributors to the codebase and their impact.""" print("\n🔍 Contributor Analysis:") - + # Define which authors are considered AI ai_authors = ['devin[bot]', 'codegen[bot]', 'github-actions[bot]', 'dependabot[bot]'] - + # Add attribution information to all symbols print("Adding attribution information to symbols...") add_attribution_to_symbols(codebase, ai_authors) - + # Collect statistics about contributors contributor_stats = Counter() ai_contributor_stats = Counter() - + print("Analyzing symbol attributions...") for symbol in codebase.symbols: if hasattr(symbol, 'last_editor') and symbol.last_editor: contributor_stats[symbol.last_editor] += 1 - + # Track if this is an AI contributor if any(ai in symbol.last_editor for ai in ai_authors): ai_contributor_stats[symbol.last_editor] += 1 - + # Print top contributors overall print("\n👥 Top Contributors by Symbols Authored:") for contributor, count in contributor_stats.most_common(10): is_ai = any(ai in contributor for ai in ai_authors) ai_indicator = "🤖" if is_ai else "👤" print(f" {ai_indicator} {contributor}: {count} symbols") - + # Print top AI contributors if any if ai_contributor_stats: print("\n🤖 Top AI Contributors:") @@ -12283,24 +12283,23 @@ if os.path.exists(".git"): repo_path = os.getcwd() repo_config = RepoConfig.from_repo_path(repo_path) repo_operator = RepoOperator(repo_config=repo_config) - + project = ProjectConfig.from_repo_operator( repo_operator=repo_operator, programming_language=ProgrammingLanguage.PYTHON ) codebase = Codebase(projects=[project]) - + # Run the contributor analysis analyze_contributors(codebase) ``` ## Conclusion -The attribution extension provides valuable insights into how AI tools are being used in your +The attribution extension provides valuable insights into how AI tools are being used in your development process. By understanding which parts of your codebase are authored by AI, you can: - Track the adoption of AI coding assistants in your team - Identify areas where AI is most effective - Ensure appropriate review of AI-generated code - Measure the impact of AI on developer productivity -
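To act on the review point above — ensuring appropriate review of AI-generated code — attribution data can be combined with the symbol-flagging mechanism shown earlier in this document. The following is a minimal sketch, not part of the original tutorial: it assumes a parsed `codebase` and that `add_attribution_to_symbols(codebase, ai_authors)` has already been run, so that symbols carry a `last_editor` attribute.

```python
# Sketch: flag undocumented, AI-authored functions for human review.
# Assumes add_attribution_to_symbols(codebase, ai_authors) has already populated
# the `last_editor` attribute on symbols (see the analysis script above).
ai_authors = ["devin[bot]", "codegen[bot]", "github-actions[bot]", "dependabot[bot]"]

for function in codebase.functions:
    editor = getattr(function, "last_editor", None)
    if editor and any(ai in editor for ai in ai_authors) and not function.docstring:
        # Same flag() mechanism used in the flagging-symbols examples above
        function.flag(message=f"AI-authored by {editor} and missing a docstring — needs human review")
```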