diff --git a/docs/building-with-codegen/parsing-codebases.mdx b/docs/building-with-codegen/parsing-codebases.mdx
index f1112ca0f..61abd3778 100644
--- a/docs/building-with-codegen/parsing-codebases.mdx
+++ b/docs/building-with-codegen/parsing-codebases.mdx
@@ -9,21 +9,29 @@ The primary entrypoint to programs leveraging Codegen is the [Codebase](/api-ref
## Local Codebases
-Construct a Codebase by passing in a path to a local `git` repository.
+Construct a Codebase by passing in a path to a local `git` repository or any subfolder within it. The path must be inside a git repository (i.e., the path itself or one of its parent directories must contain a `.git` folder).
```python
from codegen import Codebase
+from codegen.sdk.enums import ProgrammingLanguage
-# Parse from a local directory
+# Parse from a git repository root
codebase = Codebase("path/to/repository")
-# Parse from current directory
+# Parse from a subfolder within a git repository
+codebase = Codebase("path/to/repository/src/subfolder")
+
+# Parse from current directory (must be within a git repo)
codebase = Codebase("./")
+
+# Specify programming language (instead of inferring from file extensions)
+codebase = Codebase("./", programming_language=ProgrammingLanguage.TYPESCRIPT)
```
- This will automatically infer the programming language of the codebase and
- parse all files in the codebase.
+ By default, Codegen will automatically infer the programming language of the codebase and
+ parse all files in the codebase. You can override this by passing the `programming_language` parameter
+ with a value from the `ProgrammingLanguage` enum.
@@ -38,16 +46,18 @@ To fetch and parse a repository directly from GitHub, use the `from_repo` functi
```python
import codegen
+from codegen.sdk.enums import ProgrammingLanguage
# Fetch and parse a repository (defaults to /tmp/codegen/{repo_name})
codebase = codegen.from_repo('fastapi/fastapi')
-# Customize temp directory, clone depth, or specific commit
+# Customize temp directory, clone depth, specific commit, or programming language
codebase = codegen.from_repo(
'fastapi/fastapi',
tmp_dir='/custom/temp/dir', # Optional: custom temp directory
- commit='786a8ada7ed0c7f9d8b04d49f24596865e4b7901',
+ commit='786a8ada7ed0c7f9d8b04d49f24596865e4b7901', # Optional: specific commit
shallow=False, # Optional: full clone instead of shallow
+ programming_language=ProgrammingLanguage.PYTHON # Optional: override language detection
)
```
@@ -56,6 +66,69 @@ codebase = codegen.from_repo(
default. The clone is shallow by default for better performance.
+## Configuration Options
+
+You can customize the behavior of your Codebase instance by passing a `CodebaseConfig` object. This allows you to configure secrets (like API keys) and toggle specific features:
+
+```python
+from codegen import Codebase
+from codegen.sdk.codebase.config import CodebaseConfig, GSFeatureFlags, Secrets
+
+codebase = Codebase(
+ "path/to/repository",
+ config=CodebaseConfig(
+ secrets=Secrets(
+ openai_key="your-openai-key" # For AI-powered features
+ ),
+ feature_flags=GSFeatureFlags(
+ sync_enabled=True, # Enable graph synchronization
+ # ... add other feature flags as needed
+ )
+ )
+)
+```
+
+The `CodebaseConfig` allows you to configure:
+- `secrets`: API keys and other sensitive information needed by the codebase
+- `feature_flags`: Toggle specific features like language engines, dependency management, and graph synchronization
+
+For a complete list of available feature flags and configuration options, see the [source code on GitHub](https://github.com/codegen-sh/codegen-sdk/blob/develop/src/codegen/sdk/codebase/config.py).
+
+## Advanced Initialization
+
+For more complex scenarios, Codegen supports an advanced initialization mode using `ProjectConfig`. This allows for fine-grained control over:
+
+- Repository configuration
+- Base path and subdirectory filtering
+- Multiple project configurations
+
+Here's an example:
+
+```python
+from codegen import Codebase
+from codegen.git.repo_operator.local_repo_operator import LocalRepoOperator
+from codegen.git.schemas.repo_config import BaseRepoConfig
+from codegen.sdk.codebase.config import ProjectConfig
+from codegen.sdk.enums import ProgrammingLanguage
+
+codebase = Codebase(
+ projects = [
+ ProjectConfig(
+ repo_operator=LocalRepoOperator(
+ repo_path="/tmp/codegen-sdk",
+ repo_config=BaseRepoConfig(),
+ bot_commit=True
+ ),
+ programming_language=ProgrammingLanguage.TYPESCRIPT,
+ base_path="src/codegen/sdk/typescript",
+ subdirectories=["src/codegen/sdk/typescript"]
+ )
+ ]
+)
+```
+
+For more details on advanced configuration options, see the [source code on GitHub](https://github.com/codegen-sh/codegen-sdk/blob/develop/src/codegen/sdk/core/codebase.py).
+
## Supported Languages
Codegen currently supports:
diff --git a/src/codegen/git/repo_operator/local_repo_operator.py b/src/codegen/git/repo_operator/local_repo_operator.py
index 151a2cace..5de559392 100644
--- a/src/codegen/git/repo_operator/local_repo_operator.py
+++ b/src/codegen/git/repo_operator/local_repo_operator.py
@@ -32,14 +32,15 @@ class LocalRepoOperator(RepoOperator):
def __init__(
self,
- repo_config: BaseRepoConfig,
repo_path: str, # full path to the repo
+ repo_config: BaseRepoConfig | None = None,
bot_commit: bool = True,
) -> None:
self._repo_path = repo_path
self._repo_name = os.path.basename(repo_path)
os.makedirs(self.repo_path, exist_ok=True)
GitCLI.init(self.repo_path)
+ repo_config = repo_config or BaseRepoConfig()
super().__init__(repo_config, self.repo_path, bot_commit)
####################################################################################################################
diff --git a/src/codegen/sdk/codebase/config.py b/src/codegen/sdk/codebase/config.py
index 01c429113..10fbddd4f 100644
--- a/src/codegen/sdk/codebase/config.py
+++ b/src/codegen/sdk/codebase/config.py
@@ -1,8 +1,13 @@
+import os
+from typing import Self
+
from pydantic import BaseModel, ConfigDict, Field
+from codegen.git.repo_operator.local_repo_operator import LocalRepoOperator
from codegen.git.repo_operator.repo_operator import RepoOperator
from codegen.sdk.enums import ProgrammingLanguage
from codegen.sdk.secrets import Secrets
+from codegen.sdk.utils import determine_project_language, split_git_path
HARD_MAX_AI_LIMIT = 500 # Global limit for AI requests
@@ -55,6 +60,28 @@ class ProjectConfig(BaseModel):
subdirectories: list[str] | None = None
programming_language: ProgrammingLanguage = ProgrammingLanguage.PYTHON
+ @classmethod
+ def from_path(cls, path: str, programming_language: ProgrammingLanguage | None = None) -> Self:
+ # Split repo_path into (git_root, base_path)
+ repo_path = os.path.abspath(path)
+ git_root, base_path = split_git_path(repo_path)
+ # Create main project
+ return cls(
+ repo_operator=LocalRepoOperator(repo_path=git_root),
+ programming_language=programming_language or determine_project_language(repo_path),
+ base_path=base_path,
+ subdirectories=[base_path] if base_path else None,
+ )
+
+ @classmethod
+ def from_repo_operator(cls, repo_operator: RepoOperator, programming_language: ProgrammingLanguage | None = None, base_path: str | None = None) -> Self:
+ return cls(
+ repo_operator=repo_operator,
+ programming_language=programming_language or determine_project_language(repo_operator.repo_path),
+ base_path=base_path,
+ subdirectories=[base_path] if base_path else None,
+ )
+
class CodebaseConfig(BaseModel):
"""Configuration for a Codebase. There can be 1 -> many codebases in a single repo
diff --git a/src/codegen/sdk/core/codebase.py b/src/codegen/sdk/core/codebase.py
index 683049959..2548ccf41 100644
--- a/src/codegen/sdk/core/codebase.py
+++ b/src/codegen/sdk/core/codebase.py
@@ -23,7 +23,6 @@
from codegen.git.repo_operator.remote_repo_operator import RemoteRepoOperator
from codegen.git.repo_operator.repo_operator import RepoOperator
from codegen.git.schemas.enums import CheckoutResult
-from codegen.git.schemas.repo_config import BaseRepoConfig
from codegen.sdk._proxy import proxy_property
from codegen.sdk.ai.helpers import AbstractAIHelper, MultiProviderAIHelper
from codegen.sdk.codebase.codebase_ai import generate_system_prompt, generate_tools
@@ -74,7 +73,6 @@
from codegen.sdk.typescript.statements.import_statement import TSImportStatement
from codegen.sdk.typescript.symbol import TSSymbol
from codegen.sdk.typescript.type_alias import TSTypeAlias
-from codegen.sdk.utils import determine_project_language, split_git_path
from codegen.shared.decorators.docs import apidoc, noapidoc, py_noapidoc
from codegen.shared.exceptions.control_flow import MaxAIRequestsError
from codegen.shared.performance.stopwatch_utils import stopwatch
@@ -119,7 +117,8 @@ def __init__(
self,
repo_path: None = None,
*,
- projects: list[ProjectConfig],
+ programming_language: None = None,
+ projects: list[ProjectConfig] | ProjectConfig,
config: CodebaseConfig = DefaultConfig,
) -> None: ...
@@ -128,6 +127,7 @@ def __init__(
self,
repo_path: str,
*,
+ programming_language: ProgrammingLanguage | None = None,  # optional; inferred when omitted
projects: None = None,
config: CodebaseConfig = DefaultConfig,
) -> None: ...
@@ -136,7 +136,8 @@ def __init__(
self,
repo_path: str | None = None,
*,
- projects: list[ProjectConfig] | None = None,
+ programming_language: ProgrammingLanguage | None = None,
+ projects: list[ProjectConfig] | ProjectConfig | None = None,
config: CodebaseConfig = DefaultConfig,
) -> None:
# Sanity check inputs
@@ -146,19 +147,16 @@ def __init__(
if repo_path is None and projects is None:
raise ValueError("Must specify either repo_path or projects")
+ if projects is not None and programming_language is not None:
+ raise ValueError("Cannot specify both projects and programming_language. Use ProjectConfig.from_path() to create projects with a custom programming_language.")
+
+ # If projects is a single ProjectConfig, convert it to a list
+ if isinstance(projects, ProjectConfig):
+ projects = [projects]
+
# Initialize project with repo_path if projects is None
if repo_path is not None:
- # Split repo_path into (git_root, base_path)
- repo_path = os.path.abspath(repo_path)
- git_root, base_path = split_git_path(repo_path)
- # Create repo_config
- repo_config = BaseRepoConfig()
- # Create main project
- main_project = ProjectConfig(
- repo_operator=LocalRepoOperator(repo_config=repo_config, repo_path=git_root),
- programming_language=determine_project_language(repo_path),
- base_path=base_path,
- )
+ main_project = ProjectConfig.from_path(repo_path, programming_language=programming_language)
projects = [main_project]
else:
main_project = projects[0]
@@ -1125,7 +1123,7 @@ def set_session_options(self, **kwargs: Unpack[SessionOptions]) -> None:
self.G.transaction_manager.reset_stopwatch(self.G.session_options.max_seconds)
@classmethod
- def from_repo(cls, repo_name: str, *, tmp_dir: str | None = None, commit: str | None = None, shallow: bool = True) -> "Codebase":
+ def from_repo(cls, repo_name: str, *, tmp_dir: str | None = None, commit: str | None = None, shallow: bool = True, programming_language: ProgrammingLanguage | None = None) -> "Codebase":
"""Fetches a codebase from GitHub and returns a Codebase instance.
Args:
@@ -1133,6 +1131,8 @@ def from_repo(cls, repo_name: str, *, tmp_dir: str | None = None, commit: str |
tmp_dir (Optional[str]): The directory to clone the repo into. Defaults to /tmp/codegen
commit (Optional[str]): The specific commit hash to clone. Defaults to HEAD
shallow (bool): Whether to do a shallow clone. Defaults to True
+ programming_language (ProgrammingLanguage | None): The programming language of the repo. Defaults to None, in which case the language is inferred from the repository.
+
Returns:
Codebase: A Codebase instance initialized with the cloned repository
"""
@@ -1163,7 +1163,6 @@ def from_repo(cls, repo_name: str, *, tmp_dir: str | None = None, commit: str |
# Ensure the operator can handle remote operations
repo_operator = LocalRepoOperator.create_from_commit(
repo_path=repo_path,
- default_branch="main", # We'll get the actual default branch after clone
commit=commit,
url=repo_url,
)
@@ -1171,7 +1170,7 @@ def from_repo(cls, repo_name: str, *, tmp_dir: str | None = None, commit: str |
# Initialize and return codebase with proper context
logger.info("Initializing Codebase...")
- project = ProjectConfig(repo_operator=repo_operator, programming_language=determine_project_language(repo_path))
+ project = ProjectConfig.from_repo_operator(repo_operator=repo_operator, programming_language=programming_language)
codebase = Codebase(projects=[project], config=DefaultConfig)
logger.info("Codebase initialization complete")
return codebase