diff --git a/docs/building-with-codegen/parsing-codebases.mdx b/docs/building-with-codegen/parsing-codebases.mdx index f1112ca0f..61abd3778 100644 --- a/docs/building-with-codegen/parsing-codebases.mdx +++ b/docs/building-with-codegen/parsing-codebases.mdx @@ -9,21 +9,29 @@ The primary entrypoint to programs leveraging Codegen is the [Codebase](/api-ref ## Local Codebases -Construct a Codebase by passing in a path to a local `git` repository. +Construct a Codebase by passing in a path to a local `git` repository or any subfolder within it. The path must be within a git repository (i.e., the path itself or one of its parent directories must contain a `.git` folder). ```python from codegen import Codebase +from codegen.sdk.enums import ProgrammingLanguage -# Parse from a local directory +# Parse from a git repository root codebase = Codebase("path/to/repository") -# Parse from current directory +# Parse from a subfolder within a git repository +codebase = Codebase("path/to/repository/src/subfolder") + +# Parse from current directory (must be within a git repo) codebase = Codebase("./") + +# Specify programming language (instead of inferring from file extensions) +codebase = Codebase("./", programming_language=ProgrammingLanguage.TYPESCRIPT) ``` - This will automatically infer the programming language of the codebase and - parse all files in the codebase. + By default, Codegen will automatically infer the programming language of the codebase and + parse all files in the codebase. You can override this by passing the `programming_language` parameter + with a value from the `ProgrammingLanguage` enum. 
@@ -38,16 +46,18 @@ To fetch and parse a repository directly from GitHub, use the `from_repo` functi ```python import codegen +from codegen.sdk.enums import ProgrammingLanguage # Fetch and parse a repository (defaults to /tmp/codegen/{repo_name}) codebase = codegen.from_repo('fastapi/fastapi') -# Customize temp directory, clone depth, or specific commit +# Customize temp directory, clone depth, specific commit, or programming language codebase = codegen.from_repo( 'fastapi/fastapi', tmp_dir='/custom/temp/dir', # Optional: custom temp directory - commit='786a8ada7ed0c7f9d8b04d49f24596865e4b7901', + commit='786a8ada7ed0c7f9d8b04d49f24596865e4b7901', # Optional: specific commit shallow=False, # Optional: full clone instead of shallow + programming_language=ProgrammingLanguage.PYTHON # Optional: override language detection ) ``` @@ -56,6 +66,69 @@ codebase = codegen.from_repo( default. The clone is shallow by default for better performance. +## Configuration Options + +You can customize the behavior of your Codebase instance by passing a `CodebaseConfig` object. This allows you to configure secrets (like API keys) and toggle specific features: + +```python +from codegen import Codebase +from codegen.sdk.codebase.config import CodebaseConfig, GSFeatureFlags, Secrets + +codebase = Codebase( + "path/to/repository", + config=CodebaseConfig( + secrets=Secrets( + openai_key="your-openai-key" # For AI-powered features + ), + feature_flags=GSFeatureFlags( + sync_enabled=True, # Enable graph synchronization + ... 
# Add other feature flags as needed + ) + ) +) +``` + +The `CodebaseConfig` allows you to configure: +- `secrets`: API keys and other sensitive information needed by the codebase +- `feature_flags`: Toggle specific features like language engines, dependency management, and graph synchronization + +For a complete list of available feature flags and configuration options, see the [source code on GitHub](https://github.com/codegen-sh/codegen-sdk/blob/develop/src/codegen/sdk/codebase/config.py). + +## Advanced Initialization + +For more complex scenarios, Codegen supports an advanced initialization mode using `ProjectConfig`. This allows for fine-grained control over: + +- Repository configuration +- Base path and subdirectory filtering +- Multiple project configurations + +Here's an example: + +```python +from codegen import Codebase +from codegen.git.repo_operator.local_repo_operator import LocalRepoOperator +from codegen.git.schemas.repo_config import BaseRepoConfig +from codegen.sdk.codebase.config import ProjectConfig +from codegen.sdk.enums import ProgrammingLanguage + +codebase = Codebase( + projects = [ + ProjectConfig( + repo_operator=LocalRepoOperator( + repo_path="/tmp/codegen-sdk", + repo_config=BaseRepoConfig(), + bot_commit=True + ), + programming_language=ProgrammingLanguage.TYPESCRIPT, + base_path="src/codegen/sdk/typescript", + subdirectories=["src/codegen/sdk/typescript"] + ) + ] +) +``` + +For more details on advanced configuration options, see the [source code on GitHub](https://github.com/codegen-sh/codegen-sdk/blob/develop/src/codegen/sdk/core/codebase.py). 
+ ## Supported Languages Codegen currently supports: diff --git a/src/codegen/git/repo_operator/local_repo_operator.py b/src/codegen/git/repo_operator/local_repo_operator.py index 151a2cace..5de559392 100644 --- a/src/codegen/git/repo_operator/local_repo_operator.py +++ b/src/codegen/git/repo_operator/local_repo_operator.py @@ -32,14 +32,15 @@ class LocalRepoOperator(RepoOperator): def __init__( self, - repo_config: BaseRepoConfig, repo_path: str, # full path to the repo + repo_config: BaseRepoConfig | None = None, bot_commit: bool = True, ) -> None: self._repo_path = repo_path self._repo_name = os.path.basename(repo_path) os.makedirs(self.repo_path, exist_ok=True) GitCLI.init(self.repo_path) + repo_config = repo_config or BaseRepoConfig() super().__init__(repo_config, self.repo_path, bot_commit) #################################################################################################################### diff --git a/src/codegen/sdk/codebase/config.py b/src/codegen/sdk/codebase/config.py index 01c429113..10fbddd4f 100644 --- a/src/codegen/sdk/codebase/config.py +++ b/src/codegen/sdk/codebase/config.py @@ -1,8 +1,13 @@ +import os +from typing import Self + from pydantic import BaseModel, ConfigDict, Field +from codegen.git.repo_operator.local_repo_operator import LocalRepoOperator from codegen.git.repo_operator.repo_operator import RepoOperator from codegen.sdk.enums import ProgrammingLanguage from codegen.sdk.secrets import Secrets +from codegen.sdk.utils import determine_project_language, split_git_path HARD_MAX_AI_LIMIT = 500 # Global limit for AI requests @@ -55,6 +60,28 @@ class ProjectConfig(BaseModel): subdirectories: list[str] | None = None programming_language: ProgrammingLanguage = ProgrammingLanguage.PYTHON + @classmethod + def from_path(cls, path: str, programming_language: ProgrammingLanguage | None = None) -> Self: + # Split repo_path into (git_root, base_path) + repo_path = os.path.abspath(path) + git_root, base_path = split_git_path(repo_path) 
+ # Create main project + return cls( + repo_operator=LocalRepoOperator(repo_path=git_root), + programming_language=programming_language or determine_project_language(repo_path), + base_path=base_path, + subdirectories=[base_path] if base_path else None, + ) + + @classmethod + def from_repo_operator(cls, repo_operator: RepoOperator, programming_language: ProgrammingLanguage | None = None, base_path: str | None = None) -> Self: + return cls( + repo_operator=repo_operator, + programming_language=programming_language or determine_project_language(repo_operator.repo_path), + base_path=base_path, + subdirectories=[base_path] if base_path else None, + ) + class CodebaseConfig(BaseModel): """Configuration for a Codebase. There can be 1 -> many codebases in a single repo diff --git a/src/codegen/sdk/core/codebase.py b/src/codegen/sdk/core/codebase.py index 683049959..2548ccf41 100644 --- a/src/codegen/sdk/core/codebase.py +++ b/src/codegen/sdk/core/codebase.py @@ -23,7 +23,6 @@ from codegen.git.repo_operator.remote_repo_operator import RemoteRepoOperator from codegen.git.repo_operator.repo_operator import RepoOperator from codegen.git.schemas.enums import CheckoutResult -from codegen.git.schemas.repo_config import BaseRepoConfig from codegen.sdk._proxy import proxy_property from codegen.sdk.ai.helpers import AbstractAIHelper, MultiProviderAIHelper from codegen.sdk.codebase.codebase_ai import generate_system_prompt, generate_tools @@ -74,7 +73,6 @@ from codegen.sdk.typescript.statements.import_statement import TSImportStatement from codegen.sdk.typescript.symbol import TSSymbol from codegen.sdk.typescript.type_alias import TSTypeAlias -from codegen.sdk.utils import determine_project_language, split_git_path from codegen.shared.decorators.docs import apidoc, noapidoc, py_noapidoc from codegen.shared.exceptions.control_flow import MaxAIRequestsError from codegen.shared.performance.stopwatch_utils import stopwatch @@ -119,7 +117,8 @@ def __init__( self, repo_path: None = None, 
*, - projects: list[ProjectConfig], + programming_language: None = None, + projects: list[ProjectConfig] | ProjectConfig, config: CodebaseConfig = DefaultConfig, ) -> None: ... @@ -128,6 +127,7 @@ def __init__( self, repo_path: str, *, + programming_language: ProgrammingLanguage, projects: None = None, config: CodebaseConfig = DefaultConfig, ) -> None: ... @@ -136,7 +136,8 @@ def __init__( self, repo_path: str | None = None, *, - projects: list[ProjectConfig] | None = None, + programming_language: ProgrammingLanguage | None = None, + projects: list[ProjectConfig] | ProjectConfig | None = None, config: CodebaseConfig = DefaultConfig, ) -> None: # Sanity check inputs @@ -146,19 +147,16 @@ def __init__( if repo_path is None and projects is None: raise ValueError("Must specify either repo_path or projects") + if projects is not None and programming_language is not None: + raise ValueError("Cannot specify both projects and programming_language. Use ProjectConfig.from_path() to create projects with a custom programming_language.") + + # If projects is a single ProjectConfig, convert it to a list + if isinstance(projects, ProjectConfig): + projects = [projects] + # Initialize project with repo_path if projects is None if repo_path is not None: - # Split repo_path into (git_root, base_path) - repo_path = os.path.abspath(repo_path) - git_root, base_path = split_git_path(repo_path) - # Create repo_config - repo_config = BaseRepoConfig() - # Create main project - main_project = ProjectConfig( - repo_operator=LocalRepoOperator(repo_config=repo_config, repo_path=git_root), - programming_language=determine_project_language(repo_path), - base_path=base_path, - ) + main_project = ProjectConfig.from_path(repo_path, programming_language=programming_language) projects = [main_project] else: main_project = projects[0] @@ -1125,7 +1123,7 @@ def set_session_options(self, **kwargs: Unpack[SessionOptions]) -> None: 
self.G.transaction_manager.reset_stopwatch(self.G.session_options.max_seconds) @classmethod - def from_repo(cls, repo_name: str, *, tmp_dir: str | None = None, commit: str | None = None, shallow: bool = True) -> "Codebase": + def from_repo(cls, repo_name: str, *, tmp_dir: str | None = None, commit: str | None = None, shallow: bool = True, programming_language: ProgrammingLanguage | None = None) -> "Codebase": """Fetches a codebase from GitHub and returns a Codebase instance. Args: @@ -1133,6 +1131,8 @@ def from_repo(cls, repo_name: str, *, tmp_dir: str | None = None, commit: str | tmp_dir (Optional[str]): The directory to clone the repo into. Defaults to /tmp/codegen commit (Optional[str]): The specific commit hash to clone. Defaults to HEAD shallow (bool): Whether to do a shallow clone. Defaults to True + programming_language (ProgrammingLanguage | None): The programming language of the repo. Defaults to None. + Returns: Codebase: A Codebase instance initialized with the cloned repository """ @@ -1163,7 +1163,6 @@ def from_repo(cls, repo_name: str, *, tmp_dir: str | None = None, commit: str | # Ensure the operator can handle remote operations repo_operator = LocalRepoOperator.create_from_commit( repo_path=repo_path, - default_branch="main", # We'll get the actual default branch after clone commit=commit, url=repo_url, ) @@ -1171,7 +1170,7 @@ def from_repo(cls, repo_name: str, *, tmp_dir: str | None = None, commit: str | # Initialize and return codebase with proper context logger.info("Initializing Codebase...") - project = ProjectConfig(repo_operator=repo_operator, programming_language=determine_project_language(repo_path)) + project = ProjectConfig.from_repo_operator(repo_operator=repo_operator, programming_language=programming_language) codebase = Codebase(projects=[project], config=DefaultConfig) logger.info("Codebase initialization complete") return codebase