diff --git a/docs/building-with-codegen/symbol-api.mdx b/docs/building-with-codegen/symbol-api.mdx
index c39ae90bb..8545fb5fe 100644
--- a/docs/building-with-codegen/symbol-api.mdx
+++ b/docs/building-with-codegen/symbol-api.mdx
@@ -38,7 +38,7 @@ All symbols share common APIs for manipulation:
- [symbol.source](/api-reference/core/Symbol#source)
- [symbol.docstring](/api-reference/core/Symbol#docstring)
- Edit operations
- - [symbol.set_docstring](/api-reference/core/Symbol#add_comment)
+ - [symbol.set_docstring](/api-reference/core/Symbol#set-docstring)
- [symbol.move_to_file](/api-reference/core/Symbol#move-to-file) (see [Moving Symbols](/building-with-codegen/moving-symbols))
- Graph relations (See [Usages and Dependencies](/building-with-codegen/dependencies-and-usages))
- [symbol.usages](/api-reference/core/Symbol#usages)
diff --git a/docs/mint.json b/docs/mint.json
index df6519df0..74d27f063 100644
--- a/docs/mint.json
+++ b/docs/mint.json
@@ -75,6 +75,7 @@
"tutorials/modularity",
"tutorials/deleting-dead-code",
"tutorials/increase-type-coverage",
+ "tutorials/training-data",
"tutorials/manage-feature-flags",
"tutorials/managing-typescript-exports",
"tutorials/converting-default-exports",
diff --git a/docs/tutorials/training-data.mdx b/docs/tutorials/training-data.mdx
new file mode 100644
index 000000000..6cd9c72d1
--- /dev/null
+++ b/docs/tutorials/training-data.mdx
@@ -0,0 +1,235 @@
+---
+title: "Generating Training Data for LLMs"
+sidebarTitle: "Training Data"
+description: "Learn how to generate training data for large language models using Codegen"
+icon: "network-wired"
+iconType: "solid"
+---
+
+This guide demonstrates how to use Codegen to generate high-quality training data for large language models (LLMs) by extracting function implementations along with their dependencies and usages. This approach is similar to [word2vec](https://www.tensorflow.org/text/tutorials/word2vec) or [node2vec](https://snap.stanford.edu/node2vec/) - given the context of a function, learn to predict the function's implementation.
+
+View the full code in our [examples repository](https://github.com/codegen-sh/codegen-examples/blob/main/generate_training_data/run.py).
+
+This example works with both Python and TypeScript repositories without modification.
+
+## Overview
+
+The process involves three main steps:
+
+1. Finding all functions in the codebase
+2. Extracting their implementations, dependencies, and usages
+3. Generating structured training data
+
+Let's walk through each step using Codegen.
+
+## Step 1: Finding Functions and Their Context
+
+First, we will do a "graph expansion" for each function - grab the function's source, as well as the full source of all usages of the function and all dependencies.
+
+See [dependencies and usages](/building-with-codegen/dependencies-and-usages) to learn more about navigating the code graph.
+
+First, let's import the types we need from Codegen:
+
+```python
+import json
+from codegen import Codebase
+from codegen.sdk.core.external_module import ExternalModule
+from codegen.sdk.core.import_resolution import Import
+from codegen.sdk.core.symbol import Symbol
+```
+
+Here's how we get the full context for each function:
+
+```python
+def get_function_context(function) -> dict:
+ """Get the implementation, dependencies, and usages of a function."""
+ context = {
+ "implementation": {"source": function.source, "filepath": function.filepath},
+ "dependencies": [],
+ "usages": [],
+ }
+
+ # Add dependencies
+ for dep in function.dependencies:
+ # Hop through imports to find the root symbol source
+ if isinstance(dep, Import):
+ dep = hop_through_imports(dep)
+
+ context["dependencies"].append({"source": dep.source, "filepath": dep.filepath})
+
+ # Add usages
+ for usage in function.usages:
+ context["usages"].append({
+ "source": usage.usage_symbol.source,
+ "filepath": usage.usage_symbol.filepath,
+ })
+
+ return context
+```
+
+Notice how we use `hop_through_imports` to resolve dependencies. When working with imports, symbols can be re-exported multiple times. For example, a helper function might be imported and re-exported through several files before being used. We need to follow this chain to find the actual implementation:
+
+```python
+def hop_through_imports(imp: Import) -> Symbol | ExternalModule:
+ """Finds the root symbol for an import."""
+ if isinstance(imp.imported_symbol, Import):
+ return hop_through_imports(imp.imported_symbol)
+ return imp.imported_symbol
+```
+
+This creates a structured representation of each function's context:
+
+```json
+{
+ "implementation": {
+ "source": "def process_data(input: str) -> dict: ...",
+ "filepath": "src/data_processor.py"
+ },
+ "dependencies": [
+ {
+ "source": "def validate_input(data: str) -> bool: ...",
+ "filepath": "src/validators.py"
+ }
+ ],
+ "usages": [
+ {
+ "source": "result = process_data(user_input)",
+ "filepath": "src/api.py"
+ }
+ ]
+}
+```
+
+## Step 2: Processing the Codebase
+
+Next, we process all functions in the codebase to generate our training data:
+
+```python
+def run(codebase: Codebase):
+ """Generate training data using a node2vec-like approach for code embeddings."""
+ # Track all function contexts
+ training_data = {
+ "functions": [],
+ "metadata": {
+ "total_functions": len(codebase.functions),
+ "total_processed": 0,
+ "avg_dependencies": 0,
+ "avg_usages": 0,
+ },
+ }
+
+ # Process each function in the codebase
+ for function in codebase.functions:
+ # Skip if function is too small
+ if len(function.source.split("\n")) < 2:
+ continue
+
+ # Get function context
+ context = get_function_context(function)
+
+ # Only keep functions with enough context
+ if len(context["dependencies"]) + len(context["usages"]) > 0:
+ training_data["functions"].append(context)
+
+ # Update metadata
+ training_data["metadata"]["total_processed"] = len(training_data["functions"])
+ if training_data["functions"]:
+ training_data["metadata"]["avg_dependencies"] = sum(
+ len(f["dependencies"]) for f in training_data["functions"]
+ ) / len(training_data["functions"])
+ training_data["metadata"]["avg_usages"] = sum(
+ len(f["usages"]) for f in training_data["functions"]
+ ) / len(training_data["functions"])
+
+ return training_data
+```
+
+## Step 3: Running the Generator
+
+Finally, we can run our training data generator on any codebase.
+
+See [parsing codebases](/building-with-codegen/parsing-codebases) to learn more
+
+```python
+if __name__ == "__main__":
+ print("Initializing codebase...")
+ codebase = Codebase.from_repo("fastapi/fastapi")
+
+ print("Generating training data...")
+ training_data = run(codebase)
+
+ print("Saving training data...")
+ with open("training_data.json", "w") as f:
+ json.dump(training_data, f, indent=2)
+ print("Training data saved to training_data.json")
+```
+
+This will:
+1. Load the target codebase
+2. Process all functions
+3. Save the structured training data to a JSON file
+
+<Note>
+ You can use any Git repository as your source codebase by passing the repo URL
+ to [Codebase.from_repo(...)](/api-reference/core/codebase#from-repo).
+</Note>
+
+## Using the Training Data
+
+The generated data can be used to train LLMs in several ways:
+
+1. **Masked Function Prediction**: Hide a function's implementation and predict it from dependencies and usages
+2. **Code Embeddings**: Generate embeddings that capture semantic relationships between functions
+3. **Dependency Prediction**: Learn to predict which functions are likely to be dependencies
+4. **Usage Pattern Learning**: Train models to understand common usage patterns
+
+For example, to create a masked prediction task:
+
+```python
+def create_training_example(function_data):
+ """Create a masked prediction example from function data."""
+ return {
+ "context": {
+ "dependencies": function_data["dependencies"],
+ "usages": function_data["usages"]
+ },
+ "target": function_data["implementation"]
+ }
+
+# Create training examples
+examples = [create_training_example(f) for f in training_data["functions"]]
+```
+
+## Best Practices
+
+1. **Filter Small Functions**: Skip trivial functions that won't provide meaningful training data:
+```python
+if len(function.source.split("\n")) < 2:
+ continue
+```
+
+2. **Ensure Sufficient Context**: Only use functions with dependencies or usages:
+```python
+if len(context["dependencies"]) + len(context["usages"]) > 0:
+ training_data["functions"].append(context)
+```
+
+3. **Track Metadata**: Keep statistics about your training data:
+```python
+training_data["metadata"] = {
+ "total_functions": len(codebase.functions),
+ "total_processed": len(training_data["functions"]),
+ "avg_dependencies": average_dependencies,
+ "avg_usages": average_usages
+}
+```
+
+4. **Handle Import Chains**: Follow import chains to find root implementations:
+```python
+def hop_through_imports(imp: Import) -> Symbol | ExternalModule:
+ if isinstance(imp.imported_symbol, Import):
+ return hop_through_imports(imp.imported_symbol)
+ return imp.imported_symbol
+```
+
+By following these guidelines, you can generate high-quality training data for your LLM projects while maintaining code quality and consistency.
\ No newline at end of file
diff --git a/src/codegen/cli/api/endpoints.py b/src/codegen/cli/api/endpoints.py
index c44e0b81a..f8b56513b 100644
--- a/src/codegen/cli/api/endpoints.py
+++ b/src/codegen/cli/api/endpoints.py
@@ -9,3 +9,4 @@
LOOKUP_ENDPOINT = f"https://{MODAL_PREFIX}--cli-lookup.modal.run"
RUN_ON_PR_ENDPOINT = f"https://{MODAL_PREFIX}--cli-run-on-pull-request.modal.run"
PR_LOOKUP_ENDPOINT = f"https://{MODAL_PREFIX}--cli-pr-lookup.modal.run"
+CODEGEN_SYSTEM_PROMPT_URL = "https://gist.githubusercontent.com/jayhack/15681a2ceaccd726f19e6fdb3a44738b/raw/17c08054e3931b3b7fdf424458269c9e607541e8/codegen-system-prompt.txt"
diff --git a/src/codegen/cli/commands/init/render.py b/src/codegen/cli/commands/init/render.py
index 27b02749a..665db8246 100644
--- a/src/codegen/cli/commands/init/render.py
+++ b/src/codegen/cli/commands/init/render.py
@@ -6,5 +6,4 @@ def get_success_message(codegen_dir: Path, docs_dir: Path, examples_dir: Path) -
return """📁 .codegen configuration folder created:
[dim]config.toml[/dim] Project configuration
[dim]codemods/[/dim] Your codemod implementations
- [dim]jupyter/[/dim] Notebooks for codebase exploration
- [dim]prompts/[/dim] AI system prompts (gitignored)"""
+ [dim]codegen-system-prompt.txt[/dim] AI system prompt (gitignored)"""
diff --git a/src/codegen/cli/workspace/initialize_workspace.py b/src/codegen/cli/workspace/initialize_workspace.py
index eac71b5c3..bfcdbcf78 100644
--- a/src/codegen/cli/workspace/initialize_workspace.py
+++ b/src/codegen/cli/workspace/initialize_workspace.py
@@ -2,6 +2,7 @@
from contextlib import nullcontext
from pathlib import Path
+import requests
import rich
import toml
from rich.status import Status
@@ -78,6 +79,7 @@ def initialize_codegen(
CONFIG_PATH = CODEGEN_FOLDER / "config.toml"
JUPYTER_DIR = CODEGEN_FOLDER / "jupyter"
CODEMODS_DIR = CODEGEN_FOLDER / "codemods"
+ SYSTEM_PROMPT_PATH = CODEGEN_FOLDER / "codegen-system-prompt.txt"
# If status is a string, create a new spinner
context = create_spinner(f" {status} folders...") if isinstance(status, str) else nullcontext()
@@ -91,6 +93,16 @@ def initialize_codegen(
JUPYTER_DIR.mkdir(parents=True, exist_ok=True)
CODEMODS_DIR.mkdir(parents=True, exist_ok=True)
+ # Download system prompt
+ try:
+ from codegen.cli.api.endpoints import CODEGEN_SYSTEM_PROMPT_URL
+
+        response = requests.get(CODEGEN_SYSTEM_PROMPT_URL, timeout=10)
+ response.raise_for_status()
+ SYSTEM_PROMPT_PATH.write_text(response.text)
+ except Exception as e:
+ rich.print(f"[yellow]Warning: Could not download system prompt: {e}[/yellow]")
+
if not repo:
rich.print("No git repository found. Please run this command in a git repository.")
else:
@@ -152,6 +164,7 @@ def modify_gitignore(codegen_folder: Path):
"examples/",
"prompts/",
"jupyter/",
+ "codegen-system-prompt.txt", # Add system prompt to gitignore
"",
"# Python cache files",
"__pycache__/",