diff --git a/.github/workflows/deploy-staging.yml b/.github/workflows/deploy-staging.yml index 756386ac..a7330ef6 100644 --- a/.github/workflows/deploy-staging.yml +++ b/.github/workflows/deploy-staging.yml @@ -3,6 +3,18 @@ name: Deploy Staging Branch on: push: branches: [ staging ] + paths-ignore: + # Ignore changes that don't affect the site + - 'src/assignments/**' + - 'scripts/**' + - 'tests/**' + - 'private/**' + - '**.md' + - '.github/**' + - 'data/**' + - 'pyproject.toml' + - 'pytest.ini' + - 'uv.lock' jobs: deploy: diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index f3e353a5..da3d9bda 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -3,6 +3,18 @@ name: Deploy to Production on: push: branches: [ main ] + paths-ignore: + # Ignore changes that don't affect the site + - 'src/assignments/**' + - 'scripts/**' + - 'tests/**' + - 'private/**' + - '**.md' + - '.github/**' + - 'data/**' + - 'pyproject.toml' + - 'pytest.ini' + - 'uv.lock' jobs: deploy: @@ -11,7 +23,7 @@ jobs: - name: Deploy to Remote Server uses: appleboy/ssh-action@v1.0.3 with: - host: ristoffer.ch + host: direct.ristoffer.ch username: crh key: ${{ secrets.SSH_DEPLOY_KEY }} script: | diff --git a/.github/workflows/shadow-tester.yml b/.github/workflows/shadow-tester.yml new file mode 100644 index 00000000..1fef1a47 --- /dev/null +++ b/.github/workflows/shadow-tester.yml @@ -0,0 +1,139 @@ +name: Shadow Tester + +on: + repository_dispatch: + types: [run-shadow-tests] + +permissions: + contents: read + pull-requests: write # Needed to comment on PRs + +jobs: + shadow-test: + runs-on: ubuntu-latest + steps: + - name: Checkout Private Repo + uses: actions/checkout@v4 + with: + repository: crheckman/private-vla-foundations + token: ${{ secrets.PRIVATE_REPO_TOKEN }} # PAT with access to private repo + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.10' + + - name: Install Python Dependencies + run: | + pip install 
pytest torch numpy + + - name: Fetch Student Code from Public PR + env: + PR_NUMBER: ${{ github.event.client_payload.pr_number }} + HEAD_BRANCH: ${{ github.event.client_payload.head_branch }} + HEAD_SHA: ${{ github.event.client_payload.head_sha }} + REPO_URL: ${{ github.event.client_payload.repo_url }} + run: | + echo "Fetching student code from PR #${PR_NUMBER}" + + # Clone the public repo + git clone https://github.com/arpg/vla-foundations.git /tmp/public-repo + cd /tmp/public-repo + + # Fetch the PR branch + git fetch origin pull/${PR_NUMBER}/head:pr-${PR_NUMBER} + git checkout pr-${PR_NUMBER} + + # Copy student code to our testing directory + # Copy src/assignments to the current repo + if [ -d "src/assignments" ]; then + cp -r src/assignments/* $GITHUB_WORKSPACE/src/assignments/ || true + fi + + echo "Student code fetched successfully" + + - name: Run Internal Rigorous Tests + id: tests + continue-on-error: true + run: | + # Run pytest with internal tests + pytest tests/internal/ -v --tb=short --maxfail=5 > test_output.txt 2>&1 + TEST_EXIT_CODE=$? 
+ + # Capture output + cat test_output.txt + + # Save exit code for later + echo "exit_code=${TEST_EXIT_CODE}" >> $GITHUB_OUTPUT + + # Exit with the actual test result + exit $TEST_EXIT_CODE + + - name: Prepare Test Summary + if: always() + id: summary + run: | + if [ -f test_output.txt ]; then + # Extract summary from pytest output + SUMMARY=$(tail -20 test_output.txt | grep -E "(PASSED|FAILED|ERROR)" || echo "Test execution completed") + + # Escape newlines for GitHub output + SUMMARY="${SUMMARY//$'\n'/'%0A'}" + echo "summary=${SUMMARY}" >> $GITHUB_OUTPUT + else + echo "summary=No test output available" >> $GITHUB_OUTPUT + fi + + - name: Comment on Public PR - Pass + if: steps.tests.outcome == 'success' + uses: peter-evans/create-or-update-comment@v3 + with: + token: ${{ secrets.GITHUB_TOKEN }} # Default token works for same repo + repository: arpg/vla-foundations + issue-number: ${{ github.event.client_payload.pr_number }} + body: | + ## ✅ Shadow CI: Internal Tests Passed + + Your submission passed all internal rigorous tests! + +
+ Test Summary + + ``` + ${{ steps.summary.outputs.summary }} + ``` + +
+ + --- + *These are hidden internal tests run by the instructor. Your code meets the required standards.* + + - name: Comment on Public PR - Fail + if: steps.tests.outcome == 'failure' + uses: peter-evans/create-or-update-comment@v3 + with: + token: ${{ secrets.GITHUB_TOKEN }} # Default token works for same repo + repository: arpg/vla-foundations + issue-number: ${{ github.event.client_payload.pr_number }} + body: | + ## ❌ Shadow CI: Internal Tests Failed + + Your submission did not pass all internal tests. Please review the feedback and make necessary corrections. + +
+ Test Summary + + ``` + ${{ steps.summary.outputs.summary }} + ``` + +
+ + ### Next Steps: + 1. Review the test failures above + 2. Make corrections to your code + 3. Push updates to your PR branch + 4. Tests will automatically re-run + + --- + *These are hidden internal tests run by the instructor. Contact @crheckman if you need clarification on the failures.* diff --git a/.github/workflows/vla-audit.yml b/.github/workflows/vla-audit.yml index b8237a36..afa5a0d5 100644 --- a/.github/workflows/vla-audit.yml +++ b/.github/workflows/vla-audit.yml @@ -42,7 +42,15 @@ jobs: ### Common Issues: - **1. Semantic Line Breaks** + **1. Required Frontmatter Fields** + - Every audit MDX file must include these fields: + - `title`: Paper title + - `author`: Paper author(s) + - `topic`: Research topic/category + - `paper`: Link to paper or citation + - All fields must have non-empty values (no placeholders like "TBD" or "TODO") + + **2. Semantic Line Breaks** - Each sentence should be on its own line - This makes PR commenting and reviewing much easier - Example: @@ -53,7 +61,7 @@ jobs: + This makes PR review much easier. ``` - **2. Clean Git History** + **3. 
Clean Git History** - No "Merge branch 'main'" commits allowed - Use `git rebase main` instead of `git merge main` - Keep your commit history linear and clean @@ -144,22 +152,3 @@ jobs: --- *This preview will be removed when the PR is closed.* - - trigger-shadow-tests: - runs-on: ubuntu-latest - if: github.event_name == 'pull_request' && github.base_ref == 'staging' - needs: audit - steps: - - name: Trigger Shadow CI in Private Repo - uses: peter-evans/repository-dispatch@v2 - with: - token: ${{ secrets.PRIVATE_DISPATCH_TOKEN }} - repository: crheckman/private-vla-foundations - event-type: run-shadow-tests - client-payload: | - { - "pr_number": "${{ github.event.pull_request.number }}", - "head_branch": "${{ github.event.pull_request.head.ref }}", - "head_sha": "${{ github.event.pull_request.head.sha }}", - "repo_url": "${{ github.event.pull_request.head.repo.clone_url }}" - } diff --git a/.gitignore b/.gitignore index 3519214c..4eea2148 100644 --- a/.gitignore +++ b/.gitignore @@ -53,15 +53,13 @@ __pycache__/ # project-specific /arxiv-digest/ -# Private repo files - do not commit to public -private/ -tests/ -scripts/_sanitize_todos.py -scripts/manage_solutions.py -scripts/sanitize.sh -scripts/setup_private_repo.sh -scripts/add_github_secret.sh -pytest.ini -PRIVATE_REPO_SETUP.md -SETUP_WITH_GH_CLI.md -QUICK_REFERENCE.md + +# Private solution infrastructure (NEVER commit to public branches) +*.backup.py + +# Claude Code skill outputs (generated reports) +.claude/releases/*.md +.claude/sync-reports/*.md +!.claude/releases/.gitkeep +!.claude/sync-reports/.gitkeep +!tests/internal/reports/.gitkeep diff --git a/README.md b/README.md index 323ea9e7..f83a27ee 100644 --- a/README.md +++ b/README.md @@ -83,103 +83,6 @@ git push --force-with-lease --- -## Repository Structure - -``` -vla-foundations/ -├── app/ # Next.js App Router (web framework) -│ ├── page.tsx # Landing page -│ ├── textbook/[slug]/ # Dynamic chapter pages -│ ├── course/ # Course overview page -│ │ └── 
assignments/[slug]/ # Dynamic assignment pages -│ └── contributors/[slug]/ # Dynamic contributor profile pages -│ -├── content/ # All MDX content (rendered as web pages) -│ ├── textbook/ # 8-chapter VLA textbook -│ │ ├── foundations/ # Chapter 0: Core concepts -│ │ ├── architectures/ # Chapter 1: Model designs -│ │ ├── data/ # Chapter 2: Dataset construction -│ │ ├── training/ # Chapter 3: Optimization methods -│ │ ├── evaluation/ # Chapter 4: Metrics and benchmarks -│ │ ├── deployment/ # Chapter 5: Production systems -│ │ ├── applications/ # Chapter 6: Real-world use cases -│ │ └── future/ # Chapter 7: Open problems -│ │ -│ ├── course/ # Course materials -│ │ ├── Syllabus.mdx # Course syllabus -│ │ ├── assignments/ # Assignment specifications -│ │ └── submissions/ # Student submission reports -│ │ -│ └── contributors/ # Contributor profiles -│ └── [github-handle].mdx # One profile per contributor -│ -└── src/ # Executable source code - └── assignments/ # Assignment code templates - └── scratch-1/ # Example: Transformer implementation - ├── README.md # Minimal README - ├── backbone.py # Implementation template with TODOs - └── generate_data.py # Dataset generator script -``` - ---- - -## The 8-Chapter Textbook - -0. **Foundations** - Core concepts and problem formulation -1. **Architectures** - Model designs and network topologies -2. **Data** - Dataset construction and curation strategies -3. **Training** - Optimization and fine-tuning methods -4. **Evaluation** - Metrics and benchmarking protocols -5. **Deployment** - Production systems and scaling -6. **Applications** - Real-world use cases and case studies -7. 
**Future Directions** - Open problems and research frontiers - ---- - -## Development Workflow - -### Initial Setup - -```bash -# Clone the repository -git clone https://github.com/arpg/vla-foundations.git -cd vla-foundations - -# Install dependencies -pnpm install - -# Run development server -pnpm dev -``` - -Navigate to `http://localhost:3000` to see the site. - -### Local Build - -```bash -# Build the static site -pnpm build - -# Preview the production build -pnpm start -``` - ---- - -## Technologies - -### Core -- **Next.js 16**: Static site generation -- **TypeScript**: Type safety -- **Tailwind CSS**: Styling -- **MDX**: Markdown with JSX - -### Content Processing -- **remark-math** + **rehype-katex**: LaTeX rendering -- **remark-gfm**: GitHub-flavored Markdown - ---- - ## Resources ### Documentation diff --git a/claude.md b/claude.md new file mode 100644 index 00000000..6837d90b --- /dev/null +++ b/claude.md @@ -0,0 +1,644 @@ +# VLA Foundations Development Guide for AI SWE Agents (Private Repo) + +This is the **private instructor repository** for VLA Foundations, containing complete assignment solutions, internal grading tests, and instructor operations. The public student-facing repository is at `arpg/vla-foundations`. This repo uses **Next.js (App Router)** for the textbook, **Tailwind CSS** for styling, **MDX** for content, and **pnpm** for package management. + +Read more about the dual-repository architecture in [INSTRUCTOR.md](INSTRUCTOR.md). 
+ +--- + +## Repository Architecture + +This is a **two-repository system**: + +``` +Private Repo (crheckman/private-vla-foundations) +├── private/ # Complete assignment solutions (NEVER PUBLIC) +│ └── solutions/ +├── tests/internal/ # Internal grading tests (NEVER PUBLIC) +│ ├── fixtures/ # Gold standard test data +│ └── reports/ # Grading reports (git-ignored) +├── scripts/ +│ ├── dev_utils.py # Solution management (inject/reset/verify-clean) +│ ├── sanitize.sh # Automated sanitization pipeline +│ └── _sanitize_todos.py # TODO comment sanitizer +├── .claude/ +│ ├── skills/ # Claude Code skills for automation +│ └── commands/ # Slash command shortcuts +└── src/assignments/ # Starter code with [SOLUTION] hints + + ↓ (Orphan push on release tag) + +Public Repo (arpg/vla-foundations) +├── src/assignments/ # Starter code (TODOs only) +├── tests/public/ # Student-visible tests +├── content/ # Textbook and assignment specs +└── [NO private/ or tests/internal/] +``` + +**Critical**: Never commit `private/` or `tests/internal/` to public branches. + +--- + +## Initial Setup + +### Prerequisites +```bash +# Install dependencies +pnpm install + +# Install uv (Python package manager) - REQUIRED +curl -LsSf https://astral.sh/uv/install.sh | sh + +# Install Python dependencies via uv +uv sync + +# Install GitHub CLI (required for skills) +brew install gh +gh auth login +``` + +### Python Environment (uv) +**All Python commands MUST use `uv run`** to ensure correct dependencies: +```bash +# Run Python scripts +uv run python scripts/dev_utils.py --list + +# Run pytest +uv run pytest tests/internal/ -v -m rigor + +# Run any Python file +uv run python src/assignments/scratch-1/generate_data.py +``` + +### Development +```bash +# Run development server +pnpm dev + +# Build production (static export in out/) +pnpm build + +# Lint Next.js +pnpm lint +``` + +--- + +## Claude Code Skills (Automation) + +This repository has **7 Claude Code skills** for workflow automation. 
See [.claude/skills/README.md](.claude/skills/README.md) for complete documentation. + +### Core Skills + +#### `/vla-guard` - Solution Leak Audit +**Purpose**: Prevent solution leaks before any public operation + +**Usage**: +```bash +/vla-guard +``` + +**What it does**: +- Scans for `[SOLUTION]` markers in `src/` and `content/` +- Verifies `private/` and `tests/internal/` not staged +- Checks git history for accidental commits +- Runs `dev_utils.py --verify-clean` (similarity detection) +- **Blocks** sync if any check fails + +**When to use**: Before every push, PR, or release + +--- + +#### `/test-rigor` - Internal Grading Tests +**Purpose**: Run internal grading tests with automatic solution injection/reset + +**Usage**: +```bash +/test-rigor +# Select: "Scratch-1" / "Scratch-2" / "All" +``` + +**What it does**: +1. Injects solutions: `python3 scripts/dev_utils.py --inject <assignment>` +2. Runs pytest: `pytest tests/internal/ -v -m rigor` +3. Generates report: `tests/internal/reports/test-report-<timestamp>.txt` +4. Resets to starter code: `python3 scripts/dev_utils.py --reset <assignment>` + +**Safe to run multiple times** - always resets after completion. + +--- + +#### `/generate-fixtures` - Gold Standard Fixtures +**Purpose**: Generate reference data for fidelity tests from solution code + +**Usage**: +```bash +/generate-fixtures +# Select assignment +``` + +**What it does**: +1. Injects solutions +2. Sets fixed random seeds (seed=42) +3. Runs solution code to generate outputs +4. Saves to `tests/internal/fixtures/<assignment>/gold_output.pt` +5. Verifies no NaNs +6. Generates fixture documentation +7. Resets to starter code + +**When to use**: After completing solution implementation or updating solution code + +--- + +#### `/grade` - Automated PR Grading +**Purpose**: Complete grading workflow for student pull requests + +**Usage**: +```bash +/grade +# Enter PR number or auto-detect latest +``` + +**What it does**: +1. Fetches student code from GitHub PR +2. 
Runs VLA Guard on student code (detect plagiarism/leaks) +3. Runs `tests/public/` (student-visible tests) +4. Injects reference solution +5. Runs `tests/internal/` (gradient leak, fidelity, training tests) +6. Restores student code +7. Generates detailed markdown feedback report +8. Posts comment on PR (optional) +9. Updates PR labels (ready-to-merge / needs-revision / changes-requested) + +**Output**: `tests/internal/reports/grade-pr<number>.md` + +**When to use**: When reviewing student submissions + +--- + +#### `/release` - Safe Assignment Publishing +**Purpose**: Orchestrate complete release workflow with comprehensive safety checks + +**Usage**: +```bash +/release +# Select: "Scratch-1" / "Scratch-2" / etc. +``` + +**What it does**: +1. Verifies on main branch, no uncommitted changes +2. Runs `/vla-guard` pre-flight audit (fail-fast) +3. Prompts for release tag (e.g., `release-scratch-2`) +4. Shows changes since last release +5. Runs `scripts/sanitize.sh` (removes private/, [SOLUTION] markers, etc.) +6. Verifies sanitization (fail-safe) +7. Creates annotated git tag +8. Pushes tag → triggers `.github/workflows/sync-to-public.yml` +9. Monitors GitHub Actions workflow execution +10. Verifies public repository (no leaks) +11. Checks deployment status (https://www.vlm-robotics.dev) +12. Generates release summary: `.claude/releases/release-<tag>.md` + +**Fail-safe**: Aborts at ANY failed check, provides remediation instructions + +**When to use**: When ready to publish assignment to students + +--- + +#### `/new-assignment` - Assignment Scaffolding +**Purpose**: Create complete assignment structure with templates + +**Usage**: +```bash +/new-assignment +# Enter name, type, focus, difficulty +``` + +**What it does**: +1. 
Creates directory structure: + - `src/assignments/<name>/` (starter code with TODOs) + - `private/solutions/<name>/` (solution templates) + - `tests/public/test_<name>_basic.py` (student-visible tests) + - `tests/internal/test_<name>_rigor.py` (grading tests) + - `content/course/assignments/<name>.mdx` (assignment spec) +2. Generates Python templates +3. Generates test templates +4. Creates README files + +**Next steps after scaffolding**: +1. Complete solution implementations +2. Run `/generate-fixtures` +3. Update MDX spec +4. Run `/test-rigor` +5. Commit changes +6. Run `/release` + +--- + +#### `/sync-check` - Post-Release Verification +**Purpose**: Verify public repository has no leaks after release sync + +**Usage**: +```bash +/sync-check +# Select: "Latest" or specify release tag +``` + +**What it does**: +1. Clones public repo to temp directory (read-only) +2. Scans for `[SOLUTION]` markers +3. Checks for private directories (`private/`, `tests/internal/`) +4. Checks for private scripts (`dev_utils.py`, `sanitize.sh`) +5. Checks for sensitive files (credentials, `*_solution.py`) +6. Verifies orphan push strategy (no linked git history) +7. Compares file lists (private vs public) +8. Checks deployment status (HTTP 200) +9. Runs sample fidelity check +10. Generates verification report: `.claude/sync-reports/sync-check-<tag>.md` +11. 
Cleans up temp files + +**When to use**: Always run after `/release` completes + +**Critical**: If leaks detected, report provides urgent remediation steps + +--- + +## Commands Useful in Development + +### Solution Management +```bash +# List all available solutions +uv run python scripts/dev_utils.py --list + +# Inject solutions for testing/grading +uv run python scripts/dev_utils.py --inject scratch-1 + +# Reset to starter code +uv run python scripts/dev_utils.py --reset scratch-1 + +# Verify no solution leaks (similarity check) +uv run python scripts/dev_utils.py --verify-clean +``` + +### Testing +```bash +# Run public tests (students can see these) +uv run pytest tests/public/ -v + +# Run internal grading tests (after injecting solutions) +uv run pytest tests/internal/ -v -m rigor + +# Run specific test file +uv run pytest tests/internal/test_scratch1_rigor.py -v + +# Generate HTML report +uv run pytest tests/internal/ --html=tests/internal/reports/report.html --self-contained-html +``` + +### Pre-Release Checks +```bash +# Complete pre-flight check +/pre-flight + +# Or manually: +uv run python scripts/dev_utils.py --verify-clean +bash scripts/sanitize.sh # (Only in orphan branch workflow) +``` + +### GitHub Operations +```bash +# List open student PRs +gh pr list --base staging --state open + +# View PR details +gh pr view 123 + +# Comment on PR +gh pr comment 123 --body "Feedback here" + +# Merge PR +gh pr merge 123 --squash +``` + +--- + +## Linting and Formatting + +### Semantic Line Breaks +**All MDX files MUST use one sentence per line.** This is mandatory to allow granular, line-by-line feedback in Pull Requests. + +**Bad:** +```markdown +This is a very long sentence with multiple ideas. It continues on the same line. This makes PR review difficult. +``` + +**Good:** +```markdown +This is a sentence on its own line. +Each idea gets its own line. +This makes PR review much easier. 
+``` + +### LaTeX +Use formal LaTeX for all mathematical derivations: +```markdown +The loss function is: +$$ +\mathcal{L} = -\sum_{t=1}^T \log p(a_t | s_t, I_t) +$$ +``` + +Do not use code blocks for math. + +### Next.js Linting +```bash +pnpm lint +``` + +--- + +## Testing Philosophy + +### Public Tests (`tests/public/`) +**Purpose**: Student-visible validation tests + +**What they test**: +- Basic model structure (initialization, shapes) +- Forward pass correctness (no NaNs, correct dimensions) +- Gradient flow (backpropagation works) + +**Students can run these**: `pytest tests/public/test_scratch1_basic.py -v` + +### Internal Tests (`tests/internal/`) +**Purpose**: Rigorous grading tests (NEVER synced to public) + +**What they test**: +- **Gradient Leak Test**: Verify frozen parameters (e.g., DINOv2 backbone) +- **Latent Fidelity Test**: Compare output against gold standard fixtures +- **Training Convergence Test**: Verify model can train and loss decreases +- **Edge Case Tests**: Boundary conditions, error handling + +**Markers**: +- `@pytest.mark.internal` - All internal tests +- `@pytest.mark.rigor` - Strict grading tests +- `@pytest.mark.gradient` - Gradient flow tests +- `@pytest.mark.fidelity` - Output comparison tests +- `@pytest.mark.training` - Training convergence tests + +**Run with**: `pytest tests/internal/ -v -m rigor` + +--- + +## Interacting with the App + +### Local Development +```bash +pnpm dev +# Access at http://localhost:3000 +``` + +### Staging Previews +Every Pull Request to `staging` branch triggers deployment to: +``` +https://vlm-robotics.dev/staging/pulls/[PR_NUMBER]/ +``` + +**Review Protocol**: +1. Read the rendered audit on the staging site +2. Comment on the **source MDX** in GitHub "Files Changed" tab +3. 
Use the **Rich Diff** view in GitHub to verify LaTeX rendering + +### Production +Production site deployed at: +``` +https://www.vlm-robotics.dev +``` + +Deployment triggered by: +- Push to `main` branch (after staging → main merge) +- GitHub Action: `.github/workflows/deploy.yml` +- Deploys to ristoffer.ch via SSH + +--- + +## Patterns & Standards + +### Amazon Principle +We do not write "summaries." We write rigorous, durable **Audits**. A high-fidelity audit IS the textbook chapter. + +### Textbook Audit Sidebars +Every audit MUST contain these three technical sidebars: + +1. **The Lineage of Failure**: Why previous approaches died +2. **Intuitive Derivation**: The geometric/mathematical intuition of the loss function +3. **Implementation Gotchas**: Practitioners' notes on coordinate frames, normalization, or hyperparameters + +### The Interface Focus +When auditing VLA models, focus on the **Interface**: +- **Input Projection**: Pixels → Tokens +- **Action Head**: Tokens → Trajectories +- **The Loss/Objective Function** + +### Git Hygiene +We are a **rebase-only** lab. Use `git rebase main`. PRs containing "Merge branch 'main'" commits will be closed. + +**Correct workflow**: +```bash +git fetch origin +git rebase origin/main +git push --force-with-lease +``` + +### Sanitization +All private solutions are marked with `[SOLUTION]` tags: +```python +# TODO: Implement RMSNorm forward pass +# [SOLUTION] Use torch.rsqrt for efficiency +result = torch.rsqrt(variance + self.eps) +``` + +The sanitization pipeline: +1. `scripts/_sanitize_todos.py` - Removes `[SOLUTION]` markers +2. `scripts/sanitize.sh` - Orchestrates full cleanup (private dirs, scripts, README) +3. Triggered automatically by `.github/workflows/sync-to-public.yml` on release tags + +**Load-bearing wall**: `scripts/sanitize.sh` is the primary defense against solution leaks. 
+ +### Orphan Push Strategy +When syncing to public repo, we use **orphan branches** to break all git history links: + +```bash +git checkout --orphan temp-public-branch +git add -A +git commit -m "Public Release: $(date)" +git push public temp-public-branch:main --force +``` + +**Benefits**: +- No commit history from private repo exposed +- Public repo has completely independent git history +- Maximum security against accidental leaks via `git log` + +--- + +## File Map of Interest + +### GitHub Actions +- [.github/workflows/sync-to-public.yml](.github/workflows/sync-to-public.yml) - Automated sync to public repo (orphan push) +- [.github/workflows/shadow-tester.yml](.github/workflows/shadow-tester.yml) - Shadow CI for student PRs +- [.github/workflows/deploy.yml](.github/workflows/deploy.yml) - Production deployment to ristoffer.ch + +### Configuration +- [next.config.ts](next.config.ts) - Next.js config with dynamic routing for staging +- [pytest.ini](pytest.ini) - pytest markers configuration +- [tailwind.config.ts](tailwind.config.ts) - Tailwind CSS configuration + +### Scripts +- [scripts/dev_utils.py](scripts/dev_utils.py) - Solution management (inject/reset/verify-clean) +- [scripts/sanitize.sh](scripts/sanitize.sh) - Complete sanitization pipeline +- [scripts/_sanitize_todos.py](scripts/_sanitize_todos.py) - TODO comment sanitizer + +### Claude Code Skills +- [.claude/skills/](/.claude/skills/) - All skill definitions +- [.claude/skills/README.md](.claude/skills/README.md) - Comprehensive skills documentation +- [.claude/commands/](.claude/commands/) - Command shortcuts + +### Components +- [components/audit/AuditLayout.tsx](components/audit/AuditLayout.tsx) - Primary wrapper for rendered textbook chapters + +### Testing +- [tests/conftest.py](tests/conftest.py) - pytest fixtures (auto-inject for internal tests) +- [tests/public/](tests/public/) - Student-visible tests +- [tests/internal/](tests/internal/) - Internal grading tests + +### Documentation +- 
[INSTRUCTOR.md](INSTRUCTOR.md) - Complete instructor guide (consolidated) +- [SKILLS_COMPLETE.md](SKILLS_COMPLETE.md) - Skills implementation summary +- [REFACTOR_COMPLETE.md](REFACTOR_COMPLETE.md) - Repository hardening summary + +--- + +## Typical Workflows + +### Creating a New Assignment +```bash +# 1. Scaffold structure +/new-assignment + +# 2. Implement solutions +# Edit: private/solutions/scratch-3/model_solution.py + +# 3. Generate fixtures +/generate-fixtures + +# 4. Update spec +# Edit: content/course/assignments/scratch-3.mdx + +# 5. Test grading +/test-rigor + +# 6. Commit +git add . && git commit -m "feat: add scratch-3 assignment" + +# 7. Release +/release + +# 8. Verify +/sync-check +``` + +### Grading Student Work +```bash +# 1. List PRs +gh pr list --base staging + +# 2. Grade PR +/grade + +# 3. Review report +cat tests/internal/reports/grade-pr123.md + +# 4. Merge if approved +gh pr merge 123 --squash +``` + +### Pre-Release Checklist +```bash +# 1. Audit +/vla-guard + +# 2. Pre-flight (audit + sanitize) +/pre-flight + +# 3. Release +/release + +# 4. Verify +/sync-check +``` + +--- + +## Shadow CI + +Student PRs to the public repo trigger **Shadow CI** - hidden testing with internal grading suite: + +1. Student opens PR to `arpg/vla-foundations` (public) +2. Public `.github/workflows/vla-audit.yml` triggers `repository_dispatch` to private repo +3. Private `.github/workflows/shadow-tester.yml` runs: + - Fetches student code + - Injects solutions + - Runs internal tests + - Posts Pass/Fail comment on public PR (no details) +4. Instructor uses `/grade` for detailed feedback + +**Purpose**: Catch critical failures early without exposing grading logic. 
+ +--- + +## Security Boundaries + +### NEVER Sync to Public +- `private/` directory (complete solutions) +- `tests/internal/` directory (grading tests) +- `scripts/dev_utils.py` (solution management) +- `scripts/sanitize.sh` (sanitization script) +- `scripts/_sanitize_todos.py` (helper script) +- `.claude/` directory (instructor automation) +- Files with `[SOLUTION]` markers + +### Multi-Layer Protection +1. **Pre-commit hook** - Blocks commits with `[SOLUTION]` in public files +2. **VLA Guard skill** - Scans for leaks before operations +3. **Sanitization pipeline** - Removes private content automatically +4. **Post-sanitization validation** - Fail-safe check in GitHub Actions +5. **Orphan push** - Breaks git history links +6. **Sync-check skill** - Verifies public repo after release + +--- + +## Requirements + +- **Node.js** 18+ +- **pnpm** 8+ +- **Python** 3.11+ +- **uv** (Python package manager): `curl -LsSf https://astral.sh/uv/install.sh | sh` +- **gh CLI** (for skills): `brew install gh && gh auth login` + +Python dependencies (managed by uv via `pyproject.toml`): +- pytest, pytest-html +- torch +- numpy + +--- + +## Support + +- **Instructor Guide**: [INSTRUCTOR.md](INSTRUCTOR.md) +- **Skills Documentation**: [.claude/skills/README.md](.claude/skills/README.md) +- **Public Repo**: https://github.com/arpg/vla-foundations +- **Course Website**: https://www.vlm-robotics.dev + +--- + +**Remember**: This is the private instructor repository. Always run `/vla-guard` before any public-facing operation. 
diff --git a/components/audit/AuditLayout.tsx b/components/audit/AuditLayout.tsx index a44c8163..71234d86 100644 --- a/components/audit/AuditLayout.tsx +++ b/components/audit/AuditLayout.tsx @@ -13,26 +13,55 @@ interface Chapter { interface AuditLayoutProps { children: ReactNode; chapters: Chapter[]; + isReviewMode?: boolean; + prNumber?: string; } -export function AuditLayout({ children, chapters }: AuditLayoutProps) { +export function AuditLayout({ children, chapters, isReviewMode = false, prNumber }: AuditLayoutProps) { return ( -
+
-
+
+ {/* Review Mode Banner */} + {isReviewMode && ( +
+
+
+ + + + +
+
+

+ 👁️ REVIEW MODE +

+

+ You are viewing a preview of this audit. This content is under review and not yet published. +

+ {prNumber && ( +

+ Preview from PR #{prNumber} +

+ )} +
+
+
+ )} +
{children}
-
diff --git a/content/course/assignments/capstone.mdx b/content/course/assignments/capstone.mdx index 43b7f82c..a267bc61 100644 --- a/content/course/assignments/capstone.mdx +++ b/content/course/assignments/capstone.mdx @@ -1,377 +1,95 @@ --- -title: 'Capstone Project: Textbook Contribution & Implementation' +title: 'The VLA Capstone: Engineering the Frontier' assignment: 3 -due: 'Week 16' -points: 300 +due: 'Finals Week' +points: 250 --- -
-

⚠️ DRAFT: NOT YET ASSIGNED

-

This assignment is still under review and subject to change. Do not begin work until this notice is removed.

-
+# The VLA Capstone: From Audit to Architecture -# Capstone Project: Textbook Contribution & Implementation +**Weight:** 25% of Final Grade +**Initial Project Specification Due:** Leading into the Architecture Lab (This Thursday). +**Mastery Deadline:** Finals Week. -## Objective +## The Philosophy: Audit, Implement, Extend -Make a substantive contribution to the VLA Foundations textbook by authoring technical content, implementing code, and presenting your work to the class. - -## Learning Goals - -- **Synthesize** knowledge from multiple research papers -- **Implement** a non-trivial VLA component or experiment -- **Communicate** technical concepts clearly in writing -- **Present** findings to a technical audience +In this course, we do not perform "re-implementations" for the sake of practice. The Capstone is a substantive contribution to the `vlm-robotics.dev` living textbook. You are expected to move from an auditor (Assignment 2) to an architect—identifying a bottleneck, proposing a delta, and proving it via implementation. ## Project Tracks -Choose **one** of the following tracks: - -### Track 1: Research Extension - -Extend an existing VLA paper with novel experiments or analysis. - -**Requirements:** -- Reproduce baseline results from a published paper -- Design and run new experiments that test an unexplored dimension -- Contribute a textbook section analyzing your findings - -**Example Projects:** -- "Does RT-2 generalize to novel object geometries?" - Test on CAD-generated objects -- "Scaling laws for VLA data diversity" - Ablate dataset composition -- "Failure modes of diffusion policies in cluttered scenes" - Systematic failure analysis +Choose **one** of the following tracks for your technical deep-dive: -### Track 2: Engineering Implementation +### Track 1: Research Extension (The "Delta" Track) +Extend an existing VLA paper with novel experiments. 
+- **Requirement:** Reproduce a baseline, then design experiments testing a specific "Initial Dissolve" (e.g., "Does RT-2 generalize to novel object geometries created in simulation?"). +- **Textbook Contribution:** A section analyzing your findings and the "Information Decay" observed. +### Track 2: Engineering Implementation (The "Systems" Track) Build a production-grade VLA component from scratch. +- **Requirement:** Implement a key technique (e.g., an optimized vision encoder for 50Hz control, a cross-embodiment training harness). +- **Textbook Contribution:** A technical "Implementation Gotchas" guide and practitioners' manual for your component. -**Requirements:** -- Implement a key VLA technique (encoder, policy, training pipeline) -- Write clean, documented, tested code -- Contribute a textbook section with implementation details - -**Example Projects:** -- "Efficient vision encoder for real-time robotic control" - Optimized transformer -- "Multi-task policy training framework" - PyTorch training harness -- "Sim-to-real transfer toolkit" - Domain randomization + evaluation suite - -### Track 3: Comprehensive Survey - -Write an authoritative survey of a VLA subtopic. - -**Requirements:** -- Read 15-20 papers in a focused area -- Identify trends, gaps, and open questions -- Contribute a textbook section synthesizing the literature - -**Example Projects:** -- "Data augmentation strategies for robotic learning" - Survey + taxonomy -- "Benchmarking protocols for manipulation tasks" - Analysis of evaluation methods -- "Foundation models for embodied AI: A critical review" - Strengths/weaknesses analysis - -## Deliverables - -### 1. Proposal (Week 8) - -**Submit**: 1-2 page proposal via pull request - -**Contents:** -- Track selection (Research/Engineering/Survey) -- Problem statement and motivation -- Planned approach and timeline -- Expected contribution to textbook - -**Grading**: Pass/Fail (instructor feedback provided) - -### 2. 
Textbook Chapter Contribution (Week 16) - -**Submit**: MDX file with written content - -**Requirements:** -- 2000-4000 words of technical writing -- LaTeX equations for mathematical formulations -- Code snippets (if applicable) -- References to relevant papers -- Fits cohesively into one of the 8 textbook chapters - -**Location**: `content/textbook/[chapter-name]/your-section.mdx` - -**Example Structure:** +### Track 3: Synthesis & Taxonomy (The "Survey" Track) +Write an authoritative survey of a VLA sub-domain. +- **Requirement:** Read 15-20 papers. Identify the "Lineage of Failure" and the scaling laws of the sub-topic. +- **Textbook Contribution:** A foundational chapter synthesizing the literature into a cohesive taxonomy. -```mdx --- -title: "3.5 Your Section Title" -chapter: 3 -subsection: 5 -author: "Your Name" ---- - -# 3.5 Your Section Title - -## Motivation - -Why does this topic matter? - -## Background - -What do readers need to know? - -## Method - -How does it work? (Include equations) - -## Results - -What did you find? (Include figures/tables) - -## Discussion - -What are the implications? - -## References - -[Numbered references] -``` - -### 3. Code Implementation (Weeks 12-16) - -**Required for Research & Engineering tracks** (optional for Survey track) - -**Submit**: Pull request with code - -**Requirements:** -- Clean, documented Python code -- README with setup instructions -- Example usage / demo script -- Unit tests (if applicable) - -**Location**: `code/capstone/your-project-name/` - -**Grading Criteria:** -- Code quality and organization (30%) -- Documentation and comments (30%) -- Functionality and correctness (40%) -### 4. Final Presentation (Week 16) +## Technical Requirements -**Format**: 15-minute presentation + 5-minute Q&A +### 1. The Architectural Delta +Your project must identify a specific bottleneck in a "Primary Paper." You are not parrots; you are auditors. 
If you choose Track 1 or 2, you must justify your architectural changes using the **Amazon Principle**: write a technical specification that proves why this change is necessary. -**Contents:** -1. Problem statement and motivation (2 min) -2. Approach and methodology (5 min) -3. Results and findings (5 min) -4. Textbook contribution overview (2 min) -5. Lessons learned and future work (1 min) +### 2. The Data Mix +You must explicitly define your data curation strategy: +- **Foundational Priors:** Which internet-scale weights (SigLIP, DINOv2) are you using? +- **Embodied Data:** Which subset of Open X-Embodiment or DROID are you sampling? +- **Synthetic Multiplication:** Are you using *MimicGen* or *RoboGen* to scale your seeds? -**Slides**: Submit PDF via pull request +### 3. Formalized Logic & Derivations +Your documentation must be grounded in $\LaTeX$. +- Derive your specific loss function $\mathcal{L}_{total}$. +- Define the state-space $S$ and the action-space $A$ (e.g., Delta-EE, Joint Velocities, or Latent Tokens). -## Timeline +### 4. Semantic Form +All MDX contributions must follow the **Semantic Line Break** rule (one sentence per line). This is mandatory for the PR review process. 
-| Week | Milestone | -|------|-----------| -| 8 | Proposal due | -| 10 | Progress check-in (office hours) | -| 12 | Draft textbook section (optional feedback) | -| 14 | Code implementation complete | -| 16 | Final presentation + all deliverables due | - -## Grading Rubric (300 points) - -| Component | Points | -|-----------|--------| -| **Textbook Contribution** | **150** | -| - Technical accuracy | 50 | -| - Writing clarity | 40 | -| - Integration with existing chapters | 30 | -| - References and citations | 30 | -| **Implementation / Code** | **100** | -| - Functionality | 40 | -| - Code quality | 30 | -| - Documentation | 30 | -| **Presentation** | **50** | -| - Content clarity | 20 | -| - Slide quality | 15 | -| - Q&A responses | 15 | -| **Total** | **300** | - -## Evaluation Criteria - -### Textbook Contribution - -**Excellent (90-100%)**: -- Novel insights or analysis -- Crystal-clear explanations -- Publication-quality figures and equations -- Comprehensive references - -**Good (80-89%)**: -- Accurate technical content -- Clear writing with minor issues -- Relevant figures and equations -- Adequate references - -**Acceptable (70-79%)**: -- Mostly accurate content -- Understandable but needs polish -- Basic figures/equations -- Some key references missing - -### Code Implementation - -**Excellent (90-100%)**: -- Production-ready code -- Comprehensive documentation -- Runs out-of-the-box -- Includes tests and examples - -**Good (80-89%)**: -- Functional code -- Adequate documentation -- Minor setup issues -- Basic examples - -**Acceptable (70-79%)**: -- Code works with effort -- Minimal documentation -- Requires debugging -- No examples - -### Presentation - -**Excellent (90-100%)**: -- Engaging and clear -- Well-structured slides -- Confident Q&A responses -- On time - -**Good (80-89%)**: -- Clear presentation -- Decent slides -- Handles most questions -- Slightly over/under time - -**Acceptable (70-79%)**: -- Understandable content -- Basic slides -- 
Struggles with some questions -- Noticeable timing issues - -## Submission Process - -### Proposal (Week 8) - -```bash -git checkout -b capstone-proposal-yourname -# Add file: content/course/proposals/yourname-proposal.md -git add content/course/proposals/yourname-proposal.md -git commit -m "Add capstone proposal: Your Name" -git push origin capstone-proposal-yourname -# Open PR to staging -``` - -### Final Submission (Week 16) - -```bash -git checkout -b capstone-final-yourname -# Add textbook section: content/textbook/[chapter]/your-section.mdx -# Add code (if applicable): code/capstone/your-project/ -# Add slides: presentations/yourname-final.pdf -git add . -git commit -m "Add capstone project: Your Title" -git push origin capstone-final-yourname -# Open PR to staging -``` - -## Example Projects from Past Semesters - -### Research Track - -**"Generalization of RT-2 to Novel Objects"** (Jane Doe, 2025) -- Reproduced RT-2 baseline on Open-X dataset -- Generated 50 novel 3D objects with unseen geometries -- Found 23% performance drop on novel objects -- Contributed to Chapter 4 (Evaluation) - -**"Data Augmentation for Robotic Grasping"** (John Smith, 2025) -- Implemented 8 augmentation strategies -- Trained policies with systematic ablations -- Identified that rotation augmentation improves generalization by 15% -- Contributed to Chapter 2 (Data) - -### Engineering Track - -**"Real-Time Vision Encoder for Edge Deployment"** (Alice Johnson, 2025) -- Implemented MobileViT-based encoder -- Achieved 30 FPS on Jetson Orin -- Only 5% accuracy drop vs. 
ViT-B -- Contributed to Chapter 5 (Deployment) - -**"Multi-Task Policy Training Framework"** (Bob Williams, 2025) -- Built PyTorch training harness for 10+ tasks -- Supports multi-GPU, checkpointing, logging -- Open-sourced with 500+ GitHub stars -- Contributed to Chapter 3 (Training) - -### Survey Track - -**"Benchmarking Protocols for Manipulation"** (Carol Lee, 2025) -- Analyzed 30 papers on manipulation benchmarks -- Created taxonomy of evaluation metrics -- Identified reproducibility issues in 60% of papers -- Contributed to Chapter 4 (Evaluation) - -**"Foundation Models for Embodied AI: A Survey"** (David Chen, 2025) -- Surveyed 40 papers on VLMs for robotics -- Mapped landscape of architectures and datasets -- Identified key open problems -- Contributed to Chapter 7 (Future Directions) - -## Resources - -### Writing - -- [How to Write a Great Research Paper](https://www.microsoft.com/en-us/research/academic-program/write-great-research-paper/) -- [LaTeX Math Symbols](https://www.overleaf.com/learn/latex/List_of_Greek_letters_and_math_symbols) -- [MDX Documentation](https://mdxjs.com/) - -### Code - -- [Google Python Style Guide](https://google.github.io/styleguide/pyguide.html) -- [PyTorch Best Practices](https://pytorch.org/tutorials/beginner/saving_loading_models.html) -- [Writing Good Documentation](https://www.writethedocs.org/guide/writing/beginners-guide-to-docs/) - -### Presentation - -- [How to Give a Great Research Talk](https://www.microsoft.com/en-us/research/academic-program/give-great-research-talk/) -- [Presentation Tips](https://www.cs.cmu.edu/~wloescher/presentations.html) +--- -## FAQs +## Team Structure: The $2\times$ Rule -**Q: Can I work in a team?** -A: No, capstone projects must be individual work. However, you can discuss ideas with classmates. +- **Individual Work:** The baseline for a high-quality contribution. +- **Group Work (Optional):** If you choose to work in a group, the technical bar for "Mastery" scales linearly. 
A 2-person team must go **$2\times$ as far**—meaning significantly larger data mixes, more robust baseline comparisons, or cross-embodiment evaluation. +- **Note:** Groups must provide a "Team Contribution Statement" in their proposal. -**Q: Can I extend my paper audit into a capstone?** -A: Yes! If you found an interesting research question during a paper audit, you can explore it further. +--- -**Q: What if my code doesn't work perfectly?** -A: Document what worked, what didn't, and why. Partial results are acceptable if well-analyzed. +## Deliverables & Grading Rubric (250 Points Total) -**Q: Can I contribute to multiple textbook chapters?** -A: Focus on one cohesive section. Quality over quantity. +### 1. Project Specification / Proposal (Pass/Fail - First Architectural Lab) +Submit via the **VLA Architecture Lab Form**. Includes team members, the "Initial Dissolve," and compute/data requirements. -**Q: What if my project scope changes?** -A: Discuss with the instructor. Pivots are allowed with justification. +### 2. Textbook Chapter Contribution (100 Points) +- **Technical Accuracy & Rigor (50 pts):** Correct $\LaTeX$, sound mathematical derivations, and deep critique. +- **Writing & Insights (50 pts):** Must include *Lineage of Failure*, *Intuitive Derivations*, and *Implementation Gotchas*. -## Getting Help +### 3. Code Implementation (75 Points) +- **Functionality & Correctness (50 pts):** Does it solve the stated bottleneck? +- **Code Quality & Docs (25 pts):** Clean Python, README with setup, and unit tests. -- **Office Hours**: Every Tuesday/Thursday 3-4 PM -- **Discussion Forum**: Post questions and get peer feedback -- **Mid-Project Check-In**: Schedule a meeting in Week 12 +### 4. Final Presentation (75 Points) +- **Content Density (50 pts):** 15-minute technical brief. +- **Q&A Rigor (25 pts):** Ability to defend your load-bearing assertions. 
-## Final Notes +--- -The capstone is your opportunity to make a lasting contribution to the VLA research community. Past student projects have been cited in papers, used by other researchers, and featured in the textbook for future cohorts. +## Submission Process: The PR Workflow -**Aim for work you'd be proud to showcase in a job interview or PhD application.** +1. **Branching:** `git checkout -b project/your-handle-topic` +2. **Pathing:** - **Textbook:** `content/textbook/[chapter]/your-section.mdx` + - **Code:** `code/capstone/your-project/` + - **Slides:** `presentations/your-name-final.pdf` +3. **The Loop:** Open a PR to `staging`. A bot will provide a preview link. Iterate until your project reaches **Level 3 (Mastery)** and is merged into the `main` textbook. -Good luck! +> **Final Note:** The capstone is your opportunity to make a lasting contribution to the VLA research community. Aim for work you would be proud to showcase in an AI Engineering interview. diff --git a/content/course/assignments/scratch-1.mdx b/content/course/assignments/scratch-1.mdx index 2b1f2bce..cdb5636d 100644 --- a/content/course/assignments/scratch-1.mdx +++ b/content/course/assignments/scratch-1.mdx @@ -5,11 +5,6 @@ due: 'Sunday, February 1, 9:00 AM MST' points: 100 --- -
-

⚠️ DRAFT: NOT YET ASSIGNED

-

This assignment is still under review and subject to change. Do not begin work until this notice is removed.

-
- # Scratch-1: The Transformer Backbone **Focus**: Implementing the $O(1)$ engine of the VLA stack. @@ -187,7 +182,9 @@ When I removed the causal mask, the following happened: ### Pass Level (B): 70-89 points - ✅ Successful implementation of the backbone -- ✅ Loss converges on the synthetic dataset (< 1.0) +- ✅ Loss shows clear convergence (appreciable decrease from initial loss) + - Expected: Initial loss ~3-4, Final loss ~1.9-2.2 + - Model should demonstrate learning, not achieve arbitrary threshold - ✅ Attention maps visualization included - ✅ Causal mask audit completed - ✅ Code is clean and documented @@ -307,6 +304,13 @@ A: Check: 2. Is the learning rate too high? (Try 1e-4) 3. Are gradients exploding? (Enable gradient clipping) +**Q: What loss should I expect?** +A: With correct implementation on the synthetic trajectory dataset: +- **Initial loss**: ~3-4 (near random guessing for 256-way classification) +- **Final loss**: ~1.9-2.2 (showing clear learning) +- **Key metric**: Appreciable decrease indicating the model learns patterns +- The action encoding represents direction + magnitude toward target, which is learnable but not trivial + ## 11. Deadline **Due**: Sunday, February 1, 9:00 AM MST diff --git a/content/course/submissions/scratch-1/Soorej S Nair.mdx b/content/course/submissions/scratch-1/Soorej S Nair.mdx new file mode 100644 index 00000000..9b291ef7 --- /dev/null +++ b/content/course/submissions/scratch-1/Soorej S Nair.mdx @@ -0,0 +1,133 @@ +--- +title: "Scratch-1 Submission: Soorej S Nair" +student: "Soorej S Nair" +date: "2026-02-03" +--- + +# Scratch-1: The Transformer Backbone + +## Loss Curve + +![Training Loss](./images/loss_curve.png) + +The model converged after around 10 iterations with final loss of 1.9277. + +## Attention Visualization + +![Attention Maps](./images/attention_maps.png) + +The attention patterns show the causal mask has worked. 
The dark upper right triangle does not receive any attention, this demonstrates that because of the causal mask, the future tokens are masked and are not passed into the model. The bright diagonal shows that the model focuses most on the latest token in the input sequence. The attention reduces as we go away from the recent tokens, as expected. + +### Sample output +``` +Using device: cpu +Epoch 1 Batch 100: avg_loss = 4.142927 +Epoch 1 Batch 200: avg_loss = 3.567831 +Epoch 1/20 - Loss: 3.2584 +Epoch 2 Batch 100: avg_loss = 2.299740 +Epoch 2 Batch 200: avg_loss = 2.243028 +Epoch 2/20 - Loss: 2.2111 +Epoch 3 Batch 100: avg_loss = 2.099218 +Epoch 3 Batch 200: avg_loss = 2.090066 +Epoch 3/20 - Loss: 2.0819 +Epoch 4 Batch 100: avg_loss = 2.046171 +Epoch 4 Batch 200: avg_loss = 2.041124 +Epoch 4/20 - Loss: 2.0388 +Epoch 5 Batch 100: avg_loss = 2.021413 +Epoch 5 Batch 200: avg_loss = 2.018425 +Epoch 5/20 - Loss: 2.0147 +Epoch 6 Batch 100: avg_loss = 2.001702 +Epoch 6 Batch 200: avg_loss = 2.000735 +Epoch 6/20 - Loss: 1.9991 +Epoch 7 Batch 100: avg_loss = 1.989224 +Epoch 7 Batch 200: avg_loss = 1.988070 +Epoch 7/20 - Loss: 1.9868 +Epoch 8 Batch 100: avg_loss = 1.981476 +Epoch 8 Batch 200: avg_loss = 1.981199 +Epoch 8/20 - Loss: 1.9792 +Epoch 9 Batch 100: avg_loss = 1.971384 +Epoch 9 Batch 200: avg_loss = 1.971669 +Epoch 9/20 - Loss: 1.9706 +Epoch 10 Batch 100: avg_loss = 1.963031 +Epoch 10 Batch 200: avg_loss = 1.963841 +Epoch 10/20 - Loss: 1.9645 +Epoch 11 Batch 100: avg_loss = 1.956031 +Epoch 11 Batch 200: avg_loss = 1.958478 +Epoch 11/20 - Loss: 1.9595 +Epoch 12 Batch 100: avg_loss = 1.950375 +Epoch 12 Batch 200: avg_loss = 1.952415 +Epoch 12/20 - Loss: 1.9539 +Epoch 13 Batch 100: avg_loss = 1.945426 +Epoch 13 Batch 200: avg_loss = 1.948742 +Epoch 13/20 - Loss: 1.9498 +Epoch 14 Batch 100: avg_loss = 1.942955 +Epoch 14 Batch 200: avg_loss = 1.944969 +Epoch 14/20 - Loss: 1.9462 +Epoch 15 Batch 100: avg_loss = 1.941023 +Epoch 15 Batch 200: avg_loss = 1.941831 +Epoch 15/20 
- Loss: 1.9429 +Epoch 16 Batch 100: avg_loss = 1.936545 +Epoch 16 Batch 200: avg_loss = 1.938385 +Epoch 16/20 - Loss: 1.9388 +Epoch 17 Batch 100: avg_loss = 1.931708 +Epoch 17 Batch 200: avg_loss = 1.934831 +Epoch 17/20 - Loss: 1.9356 +Epoch 18 Batch 100: avg_loss = 1.929214 +Epoch 18 Batch 200: avg_loss = 1.931722 +Epoch 18/20 - Loss: 1.9326 +Epoch 19 Batch 100: avg_loss = 1.926539 +Epoch 19 Batch 200: avg_loss = 1.928762 +Epoch 19/20 - Loss: 1.9301 +Epoch 20 Batch 100: avg_loss = 1.923337 +Epoch 20 Batch 200: avg_loss = 1.926616 +Epoch 20/20 - Loss: 1.9277 +Saved loss curve to /Users/soorejsnair/Documents/code/Course Folder/Spring 26/CSCI 7000/vla-foundations/content/course/submissions/scratch-1/images/loss_curve.png +Saved attention heatmap to /Users/soorejsnair/Documents/code/Course Folder/Spring 26/CSCI 7000/vla-foundations/content/course/submissions/scratch-1/images/attention_maps.png +``` +## The Audit: Removing the Causal Mask + +When I removed the causal mask, the following happened: + +* Loss observed +![Training Loss](./images/loss_curve_without_causal_mask.png) +We see that the loss converges quickly with a final loss of 0.0460 which is much lower than the one with the causal mask. This is because of data leak. Since we do not use causal mask the model uses the future tokens to train the model and causes accuracy to skyrocket since it is predicting tokens it is already trained on. + + +* Attention Map +![Attention Maps](./images/attention_maps_without_causal_mask.png) +The attention map also says the same thing, instead of the lower triangle alone, the attention is now spread over all the tokens (including the future tokens) + +### Why the Model "Cheats" + +* This behaviour shows that, the model makes a prediction using the future rokens, which is unlike the expected behavious of autoregressive models. 
The model knowing the future token, training on it and then making a prediction with very low loss is considered "cheating". + + +## Why RoPE Is Superior to Sinusoidal Positional Embeddings + +In standard sinusoidal positional embeddings, absolute position vectors are added directly to token embeddings before attention is computed. This causes the dot-product attention score to entangle content and absolute position, making the model sensitive to when a pattern occurs rather than how tokens relate to each other. + +Rotary Positional Embeddings (RoPE) instead apply a position-dependent rotation to the query and key vectors. This transforms the attention score as: + +$ (R(i)q_i)^\top (R(j)k_j) = q_i^\top R(j - i) k_j $ + + +This formula shows that attention depends explicitly on the relative difference between the tokens rather than on their absolute positions. For spatial and trajectory data, where motion patterns are translation-invariant, this is a more appropriate inductive bias. + +### Ablation: RoPE vs Sinusoidal Embeddings + +We trained two identical models differing only in their positional encoding scheme. The RoPE-based model converged faster and achieved lower final loss (1.9277) compared to the sinusoidal baseline (1.9858). This confirms that relative positional encoding improves learning efficiency and generalization for spatial robot trajectories. + +![Training Loss](./images/loss_curve_sinusoidal.png) + +## Code Highlights + +* **Configurable Positional Encoding (RoPE vs. Sinusoidal).** + * The implementation supports both Rotary Positional Embeddings (RoPE) and sinusoidal positional embeddings, controlled by the global SINUSOIDAL flag. When SINUSOIDAL is False, RoPE embeddings are used; when SINUSOIDAL is True, absolute sinusoidal embeddings are passed to the model. + +* **KV caching for efficient inference.** + + * KV caching reduces inference complexity from O(T²) to O(T) by reusing key/value tensors across decoding steps. 
Empirically, generation latency per token drops significantly after the first token. + +* **Causal mask control** + * Causal masking is implemented explicitly using a lower-triangular attention mask and can be toggled via the CAUSAL_MASKING flag. This allowed direct inspection of the “causal cheating” behavior when the mask is disabled, making the effect of information leakage easy to observe both in loss curves and attention visualizations. + diff --git a/content/course/submissions/scratch-1/images/attention_maps.png b/content/course/submissions/scratch-1/images/attention_maps.png new file mode 100644 index 00000000..3915b39e Binary files /dev/null and b/content/course/submissions/scratch-1/images/attention_maps.png differ diff --git a/content/course/submissions/scratch-1/images/attention_maps_sinusoidal.png b/content/course/submissions/scratch-1/images/attention_maps_sinusoidal.png new file mode 100644 index 00000000..60711548 Binary files /dev/null and b/content/course/submissions/scratch-1/images/attention_maps_sinusoidal.png differ diff --git a/content/course/submissions/scratch-1/images/attention_maps_without_causal_mask.png b/content/course/submissions/scratch-1/images/attention_maps_without_causal_mask.png new file mode 100644 index 00000000..b7ca6c1e Binary files /dev/null and b/content/course/submissions/scratch-1/images/attention_maps_without_causal_mask.png differ diff --git a/content/course/submissions/scratch-1/images/loss_curve.png b/content/course/submissions/scratch-1/images/loss_curve.png new file mode 100644 index 00000000..1a1780c4 Binary files /dev/null and b/content/course/submissions/scratch-1/images/loss_curve.png differ diff --git a/content/course/submissions/scratch-1/images/loss_curve_sinusoidal.png b/content/course/submissions/scratch-1/images/loss_curve_sinusoidal.png new file mode 100644 index 00000000..46777ba2 Binary files /dev/null and b/content/course/submissions/scratch-1/images/loss_curve_sinusoidal.png differ diff --git 
a/content/course/submissions/scratch-1/images/loss_curve_without_causal_mask.png b/content/course/submissions/scratch-1/images/loss_curve_without_causal_mask.png new file mode 100644 index 00000000..6eb5dac8 Binary files /dev/null and b/content/course/submissions/scratch-1/images/loss_curve_without_causal_mask.png differ diff --git a/data/trajectories.pkl b/data/trajectories.pkl new file mode 100644 index 00000000..d5b55493 Binary files /dev/null and b/data/trajectories.pkl differ diff --git a/grading_reports/GRADING_REPORT.md b/grading_reports/GRADING_REPORT.md new file mode 100644 index 00000000..ad99928c --- /dev/null +++ b/grading_reports/GRADING_REPORT.md @@ -0,0 +1,65 @@ +![Chris-Bot](~/chris_robot.png) +### 🤖 Chris's Grading Assistant - Feedback Report + +**Student:** @Soorej30 +**PR:** #45 +**Branch:** `scratch-1-Soorej30` + +Hi! I've reviewed your submission. Here's what I found: + +--- + +## 📊 Component Feedback + +### ✅ Causal Self-Attention + +✅ Perfect! Your causal mask correctly prevents future token leakage. + +✅ Test passed. + +### ✅ RMSNorm + +✅ RMSNorm implemented correctly with proper normalization and learnable scale. + +✅ Test passed. + +### ✅ Training Loop + +✅ Excellent! Your model trains successfully and loss converges. + +### ✅ RoPE Embeddings + +✅ RoPE correctly applied to Q and K tensors. + +### ✅ Model Architecture + +✅ Model forward pass works end-to-end with correct output shapes. + +✅ Model has the expected number of trainable parameters. + +### ✅ Code Quality + +Your code imports and runs cleanly. Nice! ✨ + +--- + +## 📝 Documentation & Analysis + +✅ Report submitted! I found: +- `content/course/submissions/scratch-1/Soorej S Nair.mdx` +- `README.md` + +Your instructor will review the quality of your analysis. + +--- + +## 🎯 Mastery Features Detected + +I noticed you implemented: +- RoPE vs Sinusoidal ablation study + +Great work going beyond the requirements! Your instructor will verify implementation quality. 
+ +--- + +> *Grading is automated but reviewed by an instructor. If you have questions, reach out on Slack!* diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..fa8661a8 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,55 @@ +[project] +name = "vla-foundations" +version = "0.1.0" +description = "VLA Foundations Course - Private Instructor Repository" +readme = "README.md" +requires-python = ">=3.10,<3.14" +dependencies = [ + "torch>=2.0.0", + "torchvision", + "numpy>=1.24.0", + "pytest>=7.0.0", + "pytest-html>=4.0.0", + "matplotlib>=3.5.0", + "pandas>=2.0.0", +] + +[[tool.uv.index]] +name = "pytorch-cpu" +url = "https://download.pytorch.org/whl/cpu" +explicit = true + +[[tool.uv.index]] +name = "pytorch-cu118" +url = "https://download.pytorch.org/whl/cu118" +explicit = true + +[tool.uv.sources] +torch = [ + { index = "pytorch-cpu", marker = "sys_platform == 'darwin'" }, + { index = "pytorch-cu118", marker = "sys_platform == 'linux'" } +] +torchvision = [ + { index = "pytorch-cpu", marker = "sys_platform == 'darwin'" }, + { index = "pytorch-cu118", marker = "sys_platform == 'linux'" } +] + +[tool.hatch.build.targets.wheel] +packages = [] + +[tool.pytest.ini_options] +markers = [ + "internal: internal grading tests (never public)", + "rigor: rigorous grading tests", + "gradient: gradient flow tests", + "fidelity: output comparison tests", + "training: training convergence tests", + "mastery: optional mastery-level features (DINOv2, KV-cache, etc.)", +] +testpaths = ["tests"] +python_files = ["test_*.py"] +python_classes = ["Test*"] +python_functions = ["test_*"] + +[dependency-groups] +dev = [] diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 00000000..ea7a96da --- /dev/null +++ b/pytest.ini @@ -0,0 +1,13 @@ +[pytest] +testpaths = tests +python_files = test_*.py +python_classes = Test* +python_functions = test_* +markers = + public: Tests that students can see and run + internal: Internal grading tests (never public) + rigor: 
Rigorous validation tests for grading + gradient: Tests for gradient flow validation + fidelity: Tests for output quality validation + training: Tests for training convergence + mastery: Optional mastery-level features (DINOv2, KV-cache, etc.) diff --git a/scripts/README.md b/scripts/README.md new file mode 100644 index 00000000..0e328843 --- /dev/null +++ b/scripts/README.md @@ -0,0 +1,49 @@ +# CI/CD Scripts + +**Critical infrastructure scripts** used in GitHub Actions workflows. + +## Contents + +### Production Scripts + +- **`manage_solutions.py`** - Inject/reset assignment solutions (used in testing) +- **`sanitize.sh`** - Main sanitization pipeline for public sync +- **`_sanitize_todos.py`** - Remove solution hints from code +- **`audit_linter.py`** - Validate paper audit MDX files + +### Usage in CI/CD + +| Script | Workflow | Purpose | +|--------|----------|---------| +| `audit_linter.py` | `vla-audit.yml` | Validate audit frontmatter | +| `sanitize.sh` | `sync-to-public.yml` | Remove private content | +| `_sanitize_todos.py` | `sync-to-public.yml` | Strip solution hints | +| `manage_solutions.py` | (local testing) | Inject/reset solutions | + +### Critical Requirements + +1. **Fail-Safe**: All scripts must return non-zero exit codes on failure +2. **Idempotent**: Can be run multiple times safely +3. **Validated**: Must pass linting before sync +4. **Documented**: Clear error messages and usage + +## Development Scripts + +Local development helpers are in `scripts/dev/`. These are **not** used in CI/CD. + +## Modification Guidelines + +Changes to scripts in this directory affect production workflows. Always: + +1. Test locally first +2. Verify exit codes +3. Check GitHub Actions logs +4. Update documentation + +## Security + +These scripts handle sensitive operations: +- `sanitize.sh` - Removes private content before public sync +- `manage_solutions.py` - Manages private solutions + +Never commit secrets or tokens to these scripts. 
diff --git a/scripts/audit_linter.py b/scripts/audit_linter.py index 8ecfa1b8..ff1952a6 100755 --- a/scripts/audit_linter.py +++ b/scripts/audit_linter.py @@ -32,6 +32,62 @@ def check_semantic_breaks(file_path): ) return errors +def validate_frontmatter(file_path, content, lines): + """Validate YAML frontmatter contains required fields.""" + errors = [] + + # Extract frontmatter + if not content.startswith('---'): + errors.append( + f"{file_path}: Missing YAML frontmatter. File must start with '---' followed by " + "title, author, paper, and topic fields." + ) + return errors + + # Find the end of frontmatter + frontmatter_end = None + for i, line in enumerate(lines[1:], start=1): + if line.strip() == '---': + frontmatter_end = i + break + + if frontmatter_end is None: + errors.append( + f"{file_path}: Malformed YAML frontmatter. Missing closing '---'." + ) + return errors + + frontmatter_lines = lines[1:frontmatter_end] + frontmatter_text = '\n'.join(frontmatter_lines) + + # Required fields for audit MDX files + required_fields = ['title', 'author', 'topic', 'paper'] + + for field in required_fields: + # Check if field exists (case-insensitive) + if not any(line.strip().lower().startswith(f'{field}:') for line in frontmatter_lines): + errors.append( + f"{file_path}: Missing required frontmatter field: '{field}'" + ) + + # Validate field values are not empty + for line in frontmatter_lines: + stripped = line.strip() + if ':' in stripped: + field_name, field_value = stripped.split(':', 1) + field_name = field_name.strip().lower() + field_value = field_value.strip() + + if field_name in required_fields: + # Check for empty values or placeholder values + if not field_value or field_value in ['""', "''", 'null', 'TBD', 'TODO']: + errors.append( + f"{file_path}: Empty or placeholder value for required field: '{field_name}'" + ) + + return errors + + def check_mdx_syntax(file_path): """Check for MDX-specific syntax issues.""" with open(file_path, 'r', encoding='utf-8') 
as f: @@ -40,12 +96,8 @@ def check_mdx_syntax(file_path): errors = [] - # Check 1: Must have YAML frontmatter at the start - if not content.startswith('---'): - errors.append( - f"{file_path}: Missing YAML frontmatter. File must start with '---' followed by " - "title, author, paper, and topic fields." - ) + # Check 1: Validate frontmatter fields + errors.extend(validate_frontmatter(file_path, content, lines)) # Check 2: No HTML comments (should use JSX-style {/* */}) if '