Skip to content

Commit ceeb41a

Browse files
author
codegen-bot
committed
.
1 parent 9743088 commit ceeb41a

File tree

6 files changed

+269
-3
lines changed

6 files changed

+269
-3
lines changed

docs/building-with-codegen/symbol-api.mdx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ All symbols share common APIs for manipulation:
3838
- [symbol.source](/api-reference/core/Symbol#source)
3939
- [symbol.docstring](/api-reference/core/Symbol#docstring)
4040
- Edit operations
41-
- [symbol.set_docstring](/api-reference/core/Symbol#add_comment)
41+
- [symbol.set_docstring](/api-reference/core/Symbol#set-docstring)
4242
- [symbol.move_to_file](/api-reference/core/Symbol#move-to-file) (see [Moving Symbols](/building-with-codegen/moving-symbols))
4343
- Graph relations (See [Usages and Dependencies](/building-with-codegen/dependencies-and-usages))
4444
- [symbol.usages](/api-reference/core/Symbol#usages)

docs/mint.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,7 @@
7575
"tutorials/modularity",
7676
"tutorials/deleting-dead-code",
7777
"tutorials/increase-type-coverage",
78+
"tutorials/training-data",
7879
"tutorials/manage-feature-flags",
7980
"tutorials/managing-typescript-exports",
8081
"tutorials/converting-default-exports",

docs/tutorials/training-data.mdx

Lines changed: 233 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,233 @@
1+
---
2+
title: "Generating Training Data for LLMs"
3+
sidebarTitle: "Training Data"
4+
description: "Learn how to generate training data for large language models using Codegen"
5+
icon: "network-wired"
6+
iconType: "solid"
7+
---
8+
9+
This guide demonstrates how to use Codegen to generate high-quality training data for large language models (LLMs) by extracting function implementations along with their dependencies and usages. This approach is similar to [word2vec](https://www.tensorflow.org/text/tutorials/word2vec) or [node2vec](https://snap.stanford.edu/node2vec/) - given the context of a function, learn to predict the function's implementation.
10+
11+
<Info>View the full code in our [examples repository](https://github.com/codegen-sh/codegen-examples/blob/main/generate_training_data/run.py)</Info>
12+
13+
## Overview
14+
15+
The process involves three main steps:
16+
17+
1. Finding all functions in the codebase
18+
2. Extracting their implementations, dependencies, and usages
19+
3. Generating structured training data
20+
21+
Let's walk through each step using Codegen.
22+
23+
## Step 1: Finding Functions and Their Context
24+
25+
First, we will do a "graph expansion" for each function - grab the function's source, as well as the full source of all usages of the function and all dependencies.
26+
27+
<Info>See [dependencies and usages](/building-with-codegen/dependencies-and-usages) to learn more about navigating the code graph</Info>
28+
29+
First, let's import the types we need from Codegen:
30+
31+
```python
32+
import codegen
33+
from codegen import Codebase
34+
from codegen.sdk.core.external_module import ExternalModule
35+
from codegen.sdk.core.import_resolution import Import
36+
from codegen.sdk.core.symbol import Symbol
37+
```
38+
39+
Here's how we get the full context for each function:
40+
41+
```python
42+
def get_function_context(function) -> dict:
43+
"""Get the implementation, dependencies, and usages of a function."""
44+
context = {
45+
"implementation": {"source": function.source, "filepath": function.filepath},
46+
"dependencies": [],
47+
"usages": [],
48+
}
49+
50+
# Add dependencies
51+
for dep in function.dependencies:
52+
# Hop through imports to find the root symbol source
53+
if isinstance(dep, Import):
54+
dep = hop_through_imports(dep)
55+
56+
context["dependencies"].append({"source": dep.source, "filepath": dep.filepath})
57+
58+
# Add usages
59+
for usage in function.usages:
60+
context["usages"].append({
61+
"source": usage.usage_symbol.source,
62+
"filepath": usage.usage_symbol.filepath,
63+
})
64+
65+
return context
66+
```
67+
68+
Notice how we use `hop_through_imports` to resolve dependencies. When working with imports, symbols can be re-exported multiple times. For example, a helper function might be imported and re-exported through several files before being used. We need to follow this chain to find the actual implementation:
69+
70+
```python
71+
def hop_through_imports(imp: Import) -> Symbol | ExternalModule:
72+
"""Finds the root symbol for an import."""
73+
if isinstance(imp.imported_symbol, Import):
74+
return hop_through_imports(imp.imported_symbol)
75+
return imp.imported_symbol
76+
```
77+
78+
This creates a structured representation of each function's context:
79+
80+
```json
81+
{
82+
"implementation": {
83+
"source": "def process_data(input: str) -> dict: ...",
84+
"filepath": "src/data_processor.py"
85+
},
86+
"dependencies": [
87+
{
88+
"source": "def validate_input(data: str) -> bool: ...",
89+
"filepath": "src/validators.py"
90+
}
91+
],
92+
"usages": [
93+
{
94+
"source": "result = process_data(user_input)",
95+
"filepath": "src/api.py"
96+
}
97+
]
98+
}
99+
```
100+
101+
## Step 2: Processing the Codebase
102+
103+
Next, we process all functions in the codebase to generate our training data:
104+
105+
```python
106+
def run(codebase: Codebase):
107+
"""Generate training data using a node2vec-like approach for code embeddings."""
108+
# Track all function contexts
109+
training_data = {
110+
"functions": [],
111+
"metadata": {
112+
"total_functions": len(codebase.functions),
113+
"total_processed": 0,
114+
"avg_dependencies": 0,
115+
"avg_usages": 0,
116+
},
117+
}
118+
119+
# Process each function in the codebase
120+
for function in codebase.functions:
121+
# Skip if function is too small
122+
if len(function.source.split("\n")) < 2:
123+
continue
124+
125+
# Get function context
126+
context = get_function_context(function)
127+
128+
# Only keep functions with enough context
129+
if len(context["dependencies"]) + len(context["usages"]) > 0:
130+
training_data["functions"].append(context)
131+
132+
# Update metadata
133+
training_data["metadata"]["total_processed"] = len(training_data["functions"])
134+
if training_data["functions"]:
135+
training_data["metadata"]["avg_dependencies"] = sum(
136+
len(f["dependencies"]) for f in training_data["functions"]
137+
) / len(training_data["functions"])
138+
training_data["metadata"]["avg_usages"] = sum(
139+
len(f["usages"]) for f in training_data["functions"]
140+
) / len(training_data["functions"])
141+
142+
return training_data
143+
```
144+
145+
## Step 3: Running the Generator
146+
147+
Finally, we can run our training data generator on any codebase.
148+
149+
<Note>See [parsing codebases](/building-with-codegen/parsing-codebases) to learn more</Note>
150+
151+
```python
152+
import json

if __name__ == "__main__":
153+
print("Initializing codebase...")
154+
codebase = Codebase.from_repo("fastapi/fastapi")
155+
156+
print("Generating training data...")
157+
training_data = run(codebase)
158+
159+
print("Saving training data...")
160+
with open("training_data.json", "w") as f:
161+
json.dump(training_data, f, indent=2)
162+
print("Training data saved to training_data.json")
163+
```
164+
165+
This will:
166+
1. Load the target codebase
167+
2. Process all functions
168+
3. Save the structured training data to a JSON file
169+
170+
<Tip>
171+
You can use any Git repository as your source codebase by passing the repo URL
172+
to [Codebase.from_repo(...)](/api-reference/core/codebase#from-repo).
173+
</Tip>
174+
175+
## Using the Training Data
176+
177+
The generated data can be used to train LLMs in several ways:
178+
179+
1. **Masked Function Prediction**: Hide a function's implementation and predict it from dependencies and usages
180+
2. **Code Embeddings**: Generate embeddings that capture semantic relationships between functions
181+
3. **Dependency Prediction**: Learn to predict which functions are likely to be dependencies
182+
4. **Usage Pattern Learning**: Train models to understand common usage patterns
183+
184+
For example, to create a masked prediction task:
185+
186+
```python
187+
def create_training_example(function_data):
188+
"""Create a masked prediction example from function data."""
189+
return {
190+
"context": {
191+
"dependencies": function_data["dependencies"],
192+
"usages": function_data["usages"]
193+
},
194+
"target": function_data["implementation"]
195+
}
196+
197+
# Create training examples
198+
examples = [create_training_example(f) for f in training_data["functions"]]
199+
```
200+
201+
## Best Practices
202+
203+
1. **Filter Small Functions**: Skip trivial functions that won't provide meaningful training data:
204+
```python
205+
if len(function.source.split("\n")) < 2:
206+
continue
207+
```
208+
209+
2. **Ensure Sufficient Context**: Only use functions with dependencies or usages:
210+
```python
211+
if len(context["dependencies"]) + len(context["usages"]) > 0:
212+
training_data["functions"].append(context)
213+
```
214+
215+
3. **Track Metadata**: Keep statistics about your training data:
216+
```python
217+
training_data["metadata"] = {
218+
"total_functions": len(codebase.functions),
219+
"total_processed": len(training_data["functions"]),
220+
"avg_dependencies": average_dependencies,
221+
"avg_usages": average_usages
222+
}
223+
```
224+
225+
4. **Handle Import Chains**: Follow import chains to find root implementations:
226+
```python
227+
def hop_through_imports(imp: Import) -> Symbol | ExternalModule:
228+
if isinstance(imp.imported_symbol, Import):
229+
return hop_through_imports(imp.imported_symbol)
230+
return imp.imported_symbol
231+
```
232+
233+
By following these guidelines, you can generate high-quality training data for your LLM projects while maintaining code quality and consistency.

src/codegen/cli/api/endpoints.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,3 +9,23 @@
99
LOOKUP_ENDPOINT = f"https://{MODAL_PREFIX}--cli-lookup.modal.run"
1010
RUN_ON_PR_ENDPOINT = f"https://{MODAL_PREFIX}--cli-run-on-pull-request.modal.run"
1111
PR_LOOKUP_ENDPOINT = f"https://{MODAL_PREFIX}--cli-pr-lookup.modal.run"
12+
13+
# Base URLs
14+
CODEGEN_API_URL = "https://api.codegen.sh"
15+
CODEGEN_WEB_URL = "https://codegen.sh"
16+
17+
# API endpoints
18+
CODEGEN_API_DOCS = f"{CODEGEN_API_URL}/docs"
19+
CODEGEN_API_EXAMPLES = f"{CODEGEN_API_URL}/examples"
20+
CODEGEN_API_CODEMOD = f"{CODEGEN_API_URL}/codemod"
21+
CODEGEN_API_CODEMOD_DEPLOY = f"{CODEGEN_API_URL}/codemod/deploy"
22+
CODEGEN_API_CODEMOD_DEPLOY_STATUS = f"{CODEGEN_API_URL}/codemod/deploy/status"
23+
CODEGEN_API_CODEMOD_DEPLOY_CANCEL = f"{CODEGEN_API_URL}/codemod/deploy/cancel"
24+
CODEGEN_API_CODEMOD_DEPLOY_LOGS = f"{CODEGEN_API_URL}/codemod/deploy/logs"
25+
26+
# Web URLs
27+
CODEGEN_WEB_PLAYGROUND = f"{CODEGEN_WEB_URL}/playground"
28+
CODEGEN_WEB_DOCS = f"{CODEGEN_WEB_URL}/docs"
29+
30+
# System prompt URL
31+
CODEGEN_SYSTEM_PROMPT_URL = "https://gist.githubusercontent.com/jayhack/15681a2ceaccd726f19e6fdb3a44738b/raw/17c08054e3931b3b7fdf424458269c9e607541e8/codegen-system-prompt.txt"

src/codegen/cli/commands/init/render.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,5 +6,4 @@ def get_success_message(codegen_dir: Path, docs_dir: Path, examples_dir: Path) -
66
return """📁 .codegen configuration folder created:
77
[dim]config.toml[/dim] Project configuration
88
[dim]codemods/[/dim] Your codemod implementations
9-
[dim]jupyter/[/dim] Notebooks for codebase exploration
10-
[dim]prompts/[/dim] AI system prompts (gitignored)"""
9+
[dim]codegen-system-prompt.txt[/dim] AI system prompt (gitignored)"""

src/codegen/cli/workspace/initialize_workspace.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from contextlib import nullcontext
33
from pathlib import Path
44

5+
import requests
56
import rich
67
import toml
78
from rich.status import Status
@@ -78,6 +79,7 @@ def initialize_codegen(
7879
CONFIG_PATH = CODEGEN_FOLDER / "config.toml"
7980
JUPYTER_DIR = CODEGEN_FOLDER / "jupyter"
8081
CODEMODS_DIR = CODEGEN_FOLDER / "codemods"
82+
SYSTEM_PROMPT_PATH = CODEGEN_FOLDER / "codegen-system-prompt.txt"
8183

8284
# If status is a string, create a new spinner
8385
context = create_spinner(f" {status} folders...") if isinstance(status, str) else nullcontext()
@@ -91,6 +93,16 @@ def initialize_codegen(
9193
JUPYTER_DIR.mkdir(parents=True, exist_ok=True)
9294
CODEMODS_DIR.mkdir(parents=True, exist_ok=True)
9395

96+
# Download system prompt
97+
try:
98+
from codegen.cli.api.endpoints import CODEGEN_SYSTEM_PROMPT_URL
99+
100+
response = requests.get(CODEGEN_SYSTEM_PROMPT_URL)
101+
response.raise_for_status()
102+
SYSTEM_PROMPT_PATH.write_text(response.text)
103+
except Exception as e:
104+
rich.print(f"[yellow]Warning: Could not download system prompt: {e}[/yellow]")
105+
94106
if not repo:
95107
rich.print("No git repository found. Please run this command in a git repository.")
96108
else:
@@ -152,6 +164,7 @@ def modify_gitignore(codegen_folder: Path):
152164
"examples/",
153165
"prompts/",
154166
"jupyter/",
167+
"codegen-system-prompt.txt", # Add system prompt to gitignore
155168
"",
156169
"# Python cache files",
157170
"__pycache__/",

0 commit comments

Comments
 (0)