Skip to content

Commit b5c9dcf

Browse files
authored
example: generating data for large-scale pretraining (#13)
* . * .
1 parent 96489bd commit b5c9dcf

File tree

2 files changed

+202
-0
lines changed

2 files changed

+202
-0
lines changed

generate_training_data/README.md

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
# Generate Codebase Pre-Training Data
2+
3+
[![Documentation](https://img.shields.io/badge/docs-docs.codegen.com-blue)](https://docs.codegen.com/tutorials/generate-training-data)
4+
5+
This example demonstrates how to use Codegen to generate training data for large-scale LLM pre-training by extracting function implementations along with their dependencies and usages. The approach is inspired by node2vec, leveraging code graphs for learning.
6+
7+
## What This Example Does
8+
9+
The script analyzes your codebase and generates training data by:
10+
11+
1. **Finding All Functions**
12+
- Scans the entire codebase to identify function definitions
13+
- Filters out trivial functions (less than 2 lines)
14+
15+
2. **Capturing Implementation Context**
16+
```python
17+
{
18+
"implementation": {
19+
"source": "def process_data():\n ...",
20+
"filepath": "src/process.py"
21+
}
22+
}
23+
```
24+
25+
3. **Extracting Dependencies**
26+
```python
27+
{
28+
"dependencies": [
29+
{
30+
"source": "def helper_function():\n ...",
31+
"filepath": "src/helpers.py"
32+
}
33+
]
34+
}
35+
```
36+
37+
4. **Recording Usages**
38+
```python
39+
{
40+
"usages": [
41+
{
42+
"source": "result = process_data()",
43+
"filepath": "src/main.py"
44+
}
45+
]
46+
}
47+
```
48+
49+
## Running the Example
50+
51+
```bash
52+
# Install Codegen
53+
pip install codegen
54+
55+
# Run the data generation
56+
python run.py
57+
```
58+
59+
The script will analyze your codebase and output a `training_data.json` file containing the structured training data.
60+
61+
## Understanding the Code
62+
63+
- `run.py` - The main script that generates the training data
64+
- Uses `get_function_context()` to extract implementation, dependencies, and usages
65+
- Processes each function and builds a comprehensive context graph
66+
- Outputs structured JSON data with metadata about the processing
67+
68+
## Output Format
69+
70+
The generated `training_data.json` follows this structure:
71+
```json
72+
{
73+
"functions": [
74+
{
75+
"implementation": { "source": "...", "filepath": "..." },
76+
"dependencies": [{ "source": "...", "filepath": "..." }],
77+
"usages": [{ "source": "...", "filepath": "..." }]
78+
}
79+
],
80+
"metadata": {
81+
"total_functions": 100,
82+
"total_processed": 85,
83+
"avg_dependencies": 2.5,
84+
"avg_usages": 3.2
85+
}
86+
}
87+
```
88+
89+
## Learn More
90+
91+
- [Full Tutorial](https://docs.codegen.com/tutorials/generate-training-data)
92+
- [Code Model Pre-training](https://docs.codegen.com/concepts/code-model-training)
93+
- [Codegen Documentation](https://docs.codegen.com)

generate_training_data/run.py

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
import json
2+
3+
import codegen
4+
from codegen import Codebase
5+
from codegen.sdk.core.external_module import ExternalModule
6+
from codegen.sdk.core.import_resolution import Import
7+
from codegen.sdk.core.symbol import Symbol
8+
9+
10+
def hop_through_imports(imp: Import) -> Symbol | ExternalModule:
    """Resolve an import chain down to its root target.

    An import may point at another import (a re-export). Follow that
    chain until the target is no longer an Import, then return the
    underlying Symbol or ExternalModule.
    """
    target = imp.imported_symbol
    while isinstance(target, Import):
        target = target.imported_symbol
    return target
15+
16+
17+
def get_function_context(function) -> dict:
    """Build a training record for one function.

    The record contains the function's own source ("implementation"),
    the source of each dependency (with imports resolved to the root
    symbol's source), and the source of each site that uses it.
    """
    dependency_entries = []
    for dependency in function.dependencies:
        # Resolve import indirection so we record the root symbol's source.
        if isinstance(dependency, Import):
            dependency = hop_through_imports(dependency)
        dependency_entries.append(
            {"source": dependency.source, "filepath": dependency.filepath}
        )

    usage_entries = [
        {
            "source": usage.usage_symbol.source,
            "filepath": usage.usage_symbol.filepath,
        }
        for usage in function.usages
    ]

    return {
        "implementation": {"source": function.source, "filepath": function.filepath},
        "dependencies": dependency_entries,
        "usages": usage_entries,
    }
43+
44+
45+
@codegen.function("generate-training-data")
def run(codebase: Codebase):
    """Generate training data using a node2vec-like approach for code embeddings.

    This codemod:
    1. Finds all functions in the codebase
    2. For each function:
       - Captures its implementation
       - Lists all dependencies (with their implementations)
       - Lists all usages (with their implementations)
    3. Outputs structured JSON data for training
    """
    total_functions = len(codebase.functions)

    contexts = []
    for fn in codebase.functions:
        # Skip trivial one-line functions: too little signal to train on.
        if len(fn.source.split("\n")) < 2:
            continue

        ctx = get_function_context(fn)

        # Keep only functions connected to the graph (>= 1 dep or usage).
        if ctx["dependencies"] or ctx["usages"]:
            contexts.append(ctx)

    processed = len(contexts)
    avg_dependencies = 0
    avg_usages = 0
    if contexts:
        avg_dependencies = sum(len(c["dependencies"]) for c in contexts) / processed
        avg_usages = sum(len(c["usages"]) for c in contexts) / processed

    training_data = {
        "functions": contexts,
        "metadata": {
            "total_functions": total_functions,
            "total_processed": processed,
            "avg_dependencies": avg_dependencies,
            "avg_usages": avg_usages,
        },
    }

    # Print stats
    print(f"Processed {training_data['metadata']['total_processed']} functions")
    print(f"Average dependencies: {training_data['metadata']['avg_dependencies']:.2f}")
    print(f"Average usages: {training_data['metadata']['avg_usages']:.2f}")

    return training_data
97+
98+
99+
if __name__ == "__main__":
    # Clone/analyze a real-world repo, build the dataset, and persist it.
    print("Initializing codebase...")
    repo = Codebase.from_repo("fastapi/fastapi")

    print("Generating training data...")
    dataset = run(repo)

    print("Saving training data...")
    with open("training_data.json", "w") as out:
        json.dump(dataset, out, indent=2)
    print("Training data saved to training_data.json")

0 commit comments

Comments
 (0)