@classmethod
def create_from_repo(cls, repo_path: str, url: str) -> Self:
    """Return an operator for ``url`` checked out at ``repo_path``.

    An existing checkout is reused only when one of its remotes points at
    ``url`` and, after fetching that remote, the local HEAD commit equals the
    remote's commit for the currently active branch. In every other case
    (not a git repo, different remote, detached HEAD, out of date, fetch
    failure) whatever is at ``repo_path`` is removed and replaced by a fresh
    ``depth=1`` clone.

    Args:
        repo_path (str): Path where the repo should be cloned. WARNING: any
            existing content at this path is deleted when it cannot be reused.
        url (str): Git URL of the repository

    Returns:
        Self: An operator created with a default ``BaseRepoConfig``,
        ``bot_commit=False`` and the checkout's active branch as
        ``default_branch``.
    """
    # Function-scope imports: the module's top-level import block is not
    # guaranteed to provide these names.
    import logging
    import shutil

    log = logging.getLogger(__name__)

    if os.path.exists(repo_path):
        try:
            git_cli = GitCLI(repo_path)
            # Compare against the remote that actually points at `url`
            # instead of assuming that remote is the one named "origin".
            matching_remote = next((remote for remote in git_cli.remotes if remote.url == url), None)
            if matching_remote is not None:
                matching_remote.fetch()
                # `active_branch` raises on a detached HEAD; that lands in the
                # handler below and results in a fresh clone.
                default_branch = git_cli.active_branch.name
                local_sha = git_cli.head.commit.hexsha
                remote_sha = matching_remote.refs[default_branch].commit.hexsha
                if local_sha == remote_sha:
                    return cls(repo_config=BaseRepoConfig(), repo_path=repo_path, default_branch=default_branch, bot_commit=False)
        except Exception:
            # Deliberate best-effort: any failure while inspecting the existing
            # checkout means "do not reuse it". Record why instead of hiding it.
            log.warning("Could not reuse existing checkout at %s; falling back to a fresh clone", repo_path, exc_info=True)

        # The path exists but is not a usable, up-to-date checkout of `url`.
        # rmtree only handles real directories, so plain files and symlinks
        # are unlinked instead.
        if os.path.isdir(repo_path) and not os.path.islink(repo_path):
            shutil.rmtree(repo_path)
        else:
            os.remove(repo_path)

    # Fresh clone with depth=1 to get only the latest commit.
    GitCLI.clone_from(url=url, to_path=repo_path, depth=1)

    # Re-open the clone to discover which branch was checked out.
    git_cli = GitCLI(repo_path)
    default_branch = git_cli.active_branch.name

    return cls(repo_config=BaseRepoConfig(), repo_path=repo_path, default_branch=default_branch, bot_commit=False)
@classmethod
def from_repo(cls, repo_name: str, *, tmp_dir: str | None = None, commit: str | None = None, shallow: bool = True) -> "Codebase":
    """Fetch a GitHub repository and return a Codebase built from the local clone.

    Args:
        repo_name (str): The name of the repository in format "owner/repo"
        tmp_dir (Optional[str]): The directory to clone the repo into. Defaults to /tmp/codegen
        commit (Optional[str]): The specific commit hash to check out. When omitted,
            the repository is obtained through ``LocalRepoOperator.create_from_repo``;
            otherwise through ``LocalRepoOperator.create_from_commit``.
        shallow (bool): Whether to do a shallow clone. Defaults to True.
            NOTE: this method does not forward the flag to either clone helper,
            so it currently has no effect; a warning is logged for ``shallow=False``.

    Returns:
        Codebase: An instance of ``cls`` initialized with the cloned repository

    Raises:
        ValueError: If ``repo_name`` is not exactly ``owner/repo`` with two
            non-empty segments, or the repo segment is ``.`` or ``..``.
    """
    # Validate before touching the filesystem. The repo segment becomes a
    # directory name under tmp_dir and the clone helpers may delete that path,
    # so an empty ("owner/"), nested ("a/b/c") or dotted ("owner/..") segment
    # must never resolve to tmp_dir itself or to one of its parents.
    owner, separator, repo = repo_name.partition("/")
    if not (separator and owner and repo) or "/" in repo or repo in {".", ".."}:
        raise ValueError("repo_name must be in format 'owner/repo'")

    logger.info(f"Fetching codebase for {repo_name}")
    if not shallow:
        # Make the no-op visible instead of silently ignoring the argument.
        logger.warning("shallow=False is not forwarded to the clone step by from_repo and is currently ignored")

    # Setup temp directory
    if tmp_dir is None:
        tmp_dir = "/tmp/codegen"
    os.makedirs(tmp_dir, exist_ok=True)
    logger.info(f"Using directory: {tmp_dir}")

    # Setup repo path and URL
    repo_path = os.path.join(tmp_dir, repo)
    repo_url = f"https://github.com/{repo_name}.git"
    logger.info(f"Will clone {repo_url} to {repo_path}")

    try:
        # Use LocalRepoOperator to fetch the repository
        logger.info("Cloning repository...")
        if commit is None:
            repo_operator = LocalRepoOperator.create_from_repo(repo_path=repo_path, url=repo_url)
        else:
            repo_operator = LocalRepoOperator.create_from_commit(
                repo_path=repo_path,
                default_branch="main",  # placeholder name; the requested commit is what gets checked out
                commit=commit,
                url=repo_url,
            )
        logger.info("Clone completed successfully")

        # Initialize and return codebase with proper context
        logger.info("Initializing Codebase...")
        project = ProjectConfig(repo_operator=repo_operator, programming_language=determine_project_language(repo_path))
        # Build through `cls` so subclasses calling from_repo get their own type.
        codebase = cls(projects=[project], config=DefaultConfig)
        logger.info("Codebase initialization complete")
        return codebase
    except Exception as e:
        logger.error(f"Failed to initialize codebase: {e}")
        raise
Defaults to HEAD - Returns: - Codebase: A Codebase instance initialized with the cloned repository - Example: - ```python - import codegen.sdk as sdk - import logging - # Enable logging to see progress - logging.basicConfig(level=logging.INFO) - # Clone a repository to default location (/tmp/codegen) - codebase = sdk.fetch_codebase('facebook/react') - # Or specify a custom directory - codebase = sdk.fetch_codebase('facebook/react', tmp_dir='~/my_repos') - # Or clone a specific commit - codebase = sdk.fetch_codebase('facebook/react', commit_hash='abc123') - ``` - """ - logger.info(f"Fetching codebase for {repo_name}") - - # Parse repo name - if "/" not in repo_name: - raise ValueError("repo_name must be in format 'owner/repo'") - owner, repo = repo_name.split("/") - - # Setup temp directory - if tmp_dir is None: - tmp_dir = DEFAULT_CODEGEN_DIR - os.makedirs(tmp_dir, exist_ok=True) - logger.info(f"Using directory: {tmp_dir}") - - # Setup repo path and URL - repo_path = os.path.join(tmp_dir, repo) - repo_url = f"https://github.com/{repo_name}.git" - logger.info(f"Will clone {repo_url} to {repo_path}") - - try: - # Use LocalRepoOperator to fetch the repository - logger.info("Cloning repository...") - repo_operator = LocalRepoOperator.create_from_commit( - repo_path=repo_path, - default_branch="main", # We'll get the actual default branch after clone - commit=commit_hash or "HEAD", - url=repo_url, - ) - logger.info("Clone completed successfully") - - # Initialize and return codebase with proper context - logger.info("Initializing Codebase...") - project = ProjectConfig(repo_operator=repo_operator, programming_language=determine_project_language(repo_path)) - codebase = Codebase(projects=[project], config=DefaultConfig) - logger.info("Codebase initialization complete") - return codebase - except Exception as e: - logger.error(f"Failed to initialize codebase: {e}") - raise