diff --git a/.github/workflows/deploy-staging.yml b/.github/workflows/deploy-staging.yml index 756386ac..a7330ef6 100644 --- a/.github/workflows/deploy-staging.yml +++ b/.github/workflows/deploy-staging.yml @@ -3,6 +3,18 @@ name: Deploy Staging Branch on: push: branches: [ staging ] + paths-ignore: + # Ignore changes that don't affect the site + - 'src/assignments/**' + - 'scripts/**' + - 'tests/**' + - 'private/**' + - '**.md' + - '.github/**' + - 'data/**' + - 'pyproject.toml' + - 'pytest.ini' + - 'uv.lock' jobs: deploy: diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index f3e353a5..da3d9bda 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -3,6 +3,18 @@ name: Deploy to Production on: push: branches: [ main ] + paths-ignore: + # Ignore changes that don't affect the site + - 'src/assignments/**' + - 'scripts/**' + - 'tests/**' + - 'private/**' + - '**.md' + - '.github/**' + - 'data/**' + - 'pyproject.toml' + - 'pytest.ini' + - 'uv.lock' jobs: deploy: @@ -11,7 +23,7 @@ jobs: - name: Deploy to Remote Server uses: appleboy/ssh-action@v1.0.3 with: - host: ristoffer.ch + host: direct.ristoffer.ch username: crh key: ${{ secrets.SSH_DEPLOY_KEY }} script: | diff --git a/.github/workflows/shadow-tester.yml b/.github/workflows/shadow-tester.yml new file mode 100644 index 00000000..1fef1a47 --- /dev/null +++ b/.github/workflows/shadow-tester.yml @@ -0,0 +1,139 @@ +name: Shadow Tester + +on: + repository_dispatch: + types: [run-shadow-tests] + +permissions: + contents: read + pull-requests: write # Needed to comment on PRs + +jobs: + shadow-test: + runs-on: ubuntu-latest + steps: + - name: Checkout Private Repo + uses: actions/checkout@v4 + with: + repository: crheckman/private-vla-foundations + token: ${{ secrets.PRIVATE_REPO_TOKEN }} # PAT with access to private repo + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.10' + + - name: Install Python Dependencies + run: | + pip install 
pytest torch numpy + + - name: Fetch Student Code from Public PR + env: + PR_NUMBER: ${{ github.event.client_payload.pr_number }} + HEAD_BRANCH: ${{ github.event.client_payload.head_branch }} + HEAD_SHA: ${{ github.event.client_payload.head_sha }} + REPO_URL: ${{ github.event.client_payload.repo_url }} + run: | + echo "Fetching student code from PR #${PR_NUMBER}" + + # Clone the public repo + git clone https://github.com/arpg/vla-foundations.git /tmp/public-repo + cd /tmp/public-repo + + # Fetch the PR branch + git fetch origin pull/${PR_NUMBER}/head:pr-${PR_NUMBER} + git checkout pr-${PR_NUMBER} + + # Copy student code to our testing directory + # Copy src/assignments to the current repo + if [ -d "src/assignments" ]; then + cp -r src/assignments/* $GITHUB_WORKSPACE/src/assignments/ || true + fi + + echo "Student code fetched successfully" + + - name: Run Internal Rigorous Tests + id: tests + continue-on-error: true + run: | + # Run pytest with internal tests + pytest tests/internal/ -v --tb=short --maxfail=5 > test_output.txt 2>&1 + TEST_EXIT_CODE=$? 
+ + # Capture output + cat test_output.txt + + # Save exit code for later + echo "exit_code=${TEST_EXIT_CODE}" >> $GITHUB_OUTPUT + + # Exit with the actual test result + exit $TEST_EXIT_CODE + + - name: Prepare Test Summary + if: always() + id: summary + run: | + if [ -f test_output.txt ]; then + # Extract summary from pytest output + SUMMARY=$(tail -20 test_output.txt | grep -E "(PASSED|FAILED|ERROR)" || echo "Test execution completed") + + # Escape newlines for GitHub output + SUMMARY="${SUMMARY//$'\n'/'%0A'}" + echo "summary=${SUMMARY}" >> $GITHUB_OUTPUT + else + echo "summary=No test output available" >> $GITHUB_OUTPUT + fi + + - name: Comment on Public PR - Pass + if: steps.tests.outcome == 'success' + uses: peter-evans/create-or-update-comment@v3 + with: + token: ${{ secrets.GITHUB_TOKEN }} # Default token works for same repo + repository: arpg/vla-foundations + issue-number: ${{ github.event.client_payload.pr_number }} + body: | + ## ✅ Shadow CI: Internal Tests Passed + + Your submission passed all internal rigorous tests! + +
+ Test Summary + + ``` + ${{ steps.summary.outputs.summary }} + ``` + +
+ + --- + *These are hidden internal tests run by the instructor. Your code meets the required standards.* + + - name: Comment on Public PR - Fail + if: steps.tests.outcome == 'failure' + uses: peter-evans/create-or-update-comment@v3 + with: + token: ${{ secrets.GITHUB_TOKEN }} # Default token works for same repo + repository: arpg/vla-foundations + issue-number: ${{ github.event.client_payload.pr_number }} + body: | + ## ❌ Shadow CI: Internal Tests Failed + + Your submission did not pass all internal tests. Please review the feedback and make necessary corrections. + +
+ Test Summary + + ``` + ${{ steps.summary.outputs.summary }} + ``` + +
+ + ### Next Steps: + 1. Review the test failures above + 2. Make corrections to your code + 3. Push updates to your PR branch + 4. Tests will automatically re-run + + --- + *These are hidden internal tests run by the instructor. Contact @crheckman if you need clarification on the failures.* diff --git a/.github/workflows/vla-audit.yml b/.github/workflows/vla-audit.yml index b8237a36..afa5a0d5 100644 --- a/.github/workflows/vla-audit.yml +++ b/.github/workflows/vla-audit.yml @@ -42,7 +42,15 @@ jobs: ### Common Issues: - **1. Semantic Line Breaks** + **1. Required Frontmatter Fields** + - Every audit MDX file must include these fields: + - `title`: Paper title + - `author`: Paper author(s) + - `topic`: Research topic/category + - `paper`: Link to paper or citation + - All fields must have non-empty values (no placeholders like "TBD" or "TODO") + + **2. Semantic Line Breaks** - Each sentence should be on its own line - This makes PR commenting and reviewing much easier - Example: @@ -53,7 +61,7 @@ jobs: + This makes PR review much easier. ``` - **2. Clean Git History** + **3. 
Clean Git History** - No "Merge branch 'main'" commits allowed - Use `git rebase main` instead of `git merge main` - Keep your commit history linear and clean @@ -144,22 +152,3 @@ jobs: --- *This preview will be removed when the PR is closed.* - - trigger-shadow-tests: - runs-on: ubuntu-latest - if: github.event_name == 'pull_request' && github.base_ref == 'staging' - needs: audit - steps: - - name: Trigger Shadow CI in Private Repo - uses: peter-evans/repository-dispatch@v2 - with: - token: ${{ secrets.PRIVATE_DISPATCH_TOKEN }} - repository: crheckman/private-vla-foundations - event-type: run-shadow-tests - client-payload: | - { - "pr_number": "${{ github.event.pull_request.number }}", - "head_branch": "${{ github.event.pull_request.head.ref }}", - "head_sha": "${{ github.event.pull_request.head.sha }}", - "repo_url": "${{ github.event.pull_request.head.repo.clone_url }}" - } diff --git a/.gitignore b/.gitignore index 3519214c..4eea2148 100644 --- a/.gitignore +++ b/.gitignore @@ -53,15 +53,13 @@ __pycache__/ # project-specific /arxiv-digest/ -# Private repo files - do not commit to public -private/ -tests/ -scripts/_sanitize_todos.py -scripts/manage_solutions.py -scripts/sanitize.sh -scripts/setup_private_repo.sh -scripts/add_github_secret.sh -pytest.ini -PRIVATE_REPO_SETUP.md -SETUP_WITH_GH_CLI.md -QUICK_REFERENCE.md + +# Private solution infrastructure (NEVER commit to public branches) +*.backup.py + +# Claude Code skill outputs (generated reports) +.claude/releases/*.md +.claude/sync-reports/*.md +!.claude/releases/.gitkeep +!.claude/sync-reports/.gitkeep +!tests/internal/reports/.gitkeep diff --git a/README.md b/README.md index 323ea9e7..f83a27ee 100644 --- a/README.md +++ b/README.md @@ -83,103 +83,6 @@ git push --force-with-lease --- -## Repository Structure - -``` -vla-foundations/ -├── app/ # Next.js App Router (web framework) -│ ├── page.tsx # Landing page -│ ├── textbook/[slug]/ # Dynamic chapter pages -│ ├── course/ # Course overview page -│ │ └── 
assignments/[slug]/ # Dynamic assignment pages -│ └── contributors/[slug]/ # Dynamic contributor profile pages -│ -├── content/ # All MDX content (rendered as web pages) -│ ├── textbook/ # 8-chapter VLA textbook -│ │ ├── foundations/ # Chapter 0: Core concepts -│ │ ├── architectures/ # Chapter 1: Model designs -│ │ ├── data/ # Chapter 2: Dataset construction -│ │ ├── training/ # Chapter 3: Optimization methods -│ │ ├── evaluation/ # Chapter 4: Metrics and benchmarks -│ │ ├── deployment/ # Chapter 5: Production systems -│ │ ├── applications/ # Chapter 6: Real-world use cases -│ │ └── future/ # Chapter 7: Open problems -│ │ -│ ├── course/ # Course materials -│ │ ├── Syllabus.mdx # Course syllabus -│ │ ├── assignments/ # Assignment specifications -│ │ └── submissions/ # Student submission reports -│ │ -│ └── contributors/ # Contributor profiles -│ └── [github-handle].mdx # One profile per contributor -│ -└── src/ # Executable source code - └── assignments/ # Assignment code templates - └── scratch-1/ # Example: Transformer implementation - ├── README.md # Minimal README - ├── backbone.py # Implementation template with TODOs - └── generate_data.py # Dataset generator script -``` - ---- - -## The 8-Chapter Textbook - -0. **Foundations** - Core concepts and problem formulation -1. **Architectures** - Model designs and network topologies -2. **Data** - Dataset construction and curation strategies -3. **Training** - Optimization and fine-tuning methods -4. **Evaluation** - Metrics and benchmarking protocols -5. **Deployment** - Production systems and scaling -6. **Applications** - Real-world use cases and case studies -7. 
**Future Directions** - Open problems and research frontiers - ---- - -## Development Workflow - -### Initial Setup - -```bash -# Clone the repository -git clone https://github.com/arpg/vla-foundations.git -cd vla-foundations - -# Install dependencies -pnpm install - -# Run development server -pnpm dev -``` - -Navigate to `http://localhost:3000` to see the site. - -### Local Build - -```bash -# Build the static site -pnpm build - -# Preview the production build -pnpm start -``` - ---- - -## Technologies - -### Core -- **Next.js 16**: Static site generation -- **TypeScript**: Type safety -- **Tailwind CSS**: Styling -- **MDX**: Markdown with JSX - -### Content Processing -- **remark-math** + **rehype-katex**: LaTeX rendering -- **remark-gfm**: GitHub-flavored Markdown - ---- - ## Resources ### Documentation diff --git a/claude.md b/claude.md new file mode 100644 index 00000000..6837d90b --- /dev/null +++ b/claude.md @@ -0,0 +1,644 @@ +# VLA Foundations Development Guide for AI SWE Agents (Private Repo) + +This is the **private instructor repository** for VLA Foundations, containing complete assignment solutions, internal grading tests, and instructor operations. The public student-facing repository is at `arpg/vla-foundations`. This repo uses **Next.js (App Router)** for the textbook, **Tailwind CSS** for styling, **MDX** for content, and **pnpm** for package management. + +Read more about the dual-repository architecture in [INSTRUCTOR.md](INSTRUCTOR.md). 
+ +--- + +## Repository Architecture + +This is a **two-repository system**: + +``` +Private Repo (crheckman/private-vla-foundations) +├── private/ # Complete assignment solutions (NEVER PUBLIC) +│ └── solutions/ +├── tests/internal/ # Internal grading tests (NEVER PUBLIC) +│ ├── fixtures/ # Gold standard test data +│ └── reports/ # Grading reports (git-ignored) +├── scripts/ +│ ├── dev_utils.py # Solution management (inject/reset/verify-clean) +│ ├── sanitize.sh # Automated sanitization pipeline +│ └── _sanitize_todos.py # TODO comment sanitizer +├── .claude/ +│ ├── skills/ # Claude Code skills for automation +│ └── commands/ # Slash command shortcuts +└── src/assignments/ # Starter code with [SOLUTION] hints + + ↓ (Orphan push on release tag) + +Public Repo (arpg/vla-foundations) +├── src/assignments/ # Starter code (TODOs only) +├── tests/public/ # Student-visible tests +├── content/ # Textbook and assignment specs +└── [NO private/ or tests/internal/] +``` + +**Critical**: Never commit `private/` or `tests/internal/` to public branches. + +--- + +## Initial Setup + +### Prerequisites +```bash +# Install dependencies +pnpm install + +# Install uv (Python package manager) - REQUIRED +curl -LsSf https://astral.sh/uv/install.sh | sh + +# Install Python dependencies via uv +uv sync + +# Install GitHub CLI (required for skills) +brew install gh +gh auth login +``` + +### Python Environment (uv) +**All Python commands MUST use `uv run`** to ensure correct dependencies: +```bash +# Run Python scripts +uv run python scripts/dev_utils.py --list + +# Run pytest +uv run pytest tests/internal/ -v -m rigor + +# Run any Python file +uv run python src/assignments/scratch-1/generate_data.py +``` + +### Development +```bash +# Run development server +pnpm dev + +# Build production (static export in out/) +pnpm build + +# Lint Next.js +pnpm lint +``` + +--- + +## Claude Code Skills (Automation) + +This repository has **7 Claude Code skills** for workflow automation. 
See [.claude/skills/README.md](.claude/skills/README.md) for complete documentation. + +### Core Skills + +#### `/vla-guard` - Solution Leak Audit +**Purpose**: Prevent solution leaks before any public operation + +**Usage**: +```bash +/vla-guard +``` + +**What it does**: +- Scans for `[SOLUTION]` markers in `src/` and `content/` +- Verifies `private/` and `tests/internal/` not staged +- Checks git history for accidental commits +- Runs `dev_utils.py --verify-clean` (similarity detection) +- **Blocks** sync if any check fails + +**When to use**: Before every push, PR, or release + +--- + +#### `/test-rigor` - Internal Grading Tests +**Purpose**: Run internal grading tests with automatic solution injection/reset + +**Usage**: +```bash +/test-rigor +# Select: "Scratch-1" / "Scratch-2" / "All" +``` + +**What it does**: +1. Injects solutions: `python3 scripts/dev_utils.py --inject <assignment>` +2. Runs pytest: `pytest tests/internal/ -v -m rigor` +3. Generates report: `tests/internal/reports/test-report-<timestamp>.txt` +4. Resets to starter code: `python3 scripts/dev_utils.py --reset <assignment>` + +**Safe to run multiple times** - always resets after completion. + +--- + +#### `/generate-fixtures` - Gold Standard Fixtures +**Purpose**: Generate reference data for fidelity tests from solution code + +**Usage**: +```bash +/generate-fixtures +# Select assignment +``` + +**What it does**: +1. Injects solutions +2. Sets fixed random seeds (seed=42) +3. Runs solution code to generate outputs +4. Saves to `tests/internal/fixtures/<assignment>/gold_output.pt` +5. Verifies no NaNs +6. Generates fixture documentation +7. Resets to starter code + +**When to use**: After completing solution implementation or updating solution code + +--- + +#### `/grade` - Automated PR Grading +**Purpose**: Complete grading workflow for student pull requests + +**Usage**: +```bash +/grade +# Enter PR number or auto-detect latest +``` + +**What it does**: +1. Fetches student code from GitHub PR +2. 
Runs VLA Guard on student code (detect plagiarism/leaks) +3. Runs `tests/public/` (student-visible tests) +4. Injects reference solution +5. Runs `tests/internal/` (gradient leak, fidelity, training tests) +6. Restores student code +7. Generates detailed markdown feedback report +8. Posts comment on PR (optional) +9. Updates PR labels (ready-to-merge / needs-revision / changes-requested) + +**Output**: `tests/internal/reports/grade-pr<number>.md` + +**When to use**: When reviewing student submissions + +--- + +#### `/release` - Safe Assignment Publishing +**Purpose**: Orchestrate complete release workflow with comprehensive safety checks + +**Usage**: +```bash +/release +# Select: "Scratch-1" / "Scratch-2" / etc. +``` + +**What it does**: +1. Verifies on main branch, no uncommitted changes +2. Runs `/vla-guard` pre-flight audit (fail-fast) +3. Prompts for release tag (e.g., `release-scratch-2`) +4. Shows changes since last release +5. Runs `scripts/sanitize.sh` (removes private/, [SOLUTION] markers, etc.) +6. Verifies sanitization (fail-safe) +7. Creates annotated git tag +8. Pushes tag → triggers `.github/workflows/sync-to-public.yml` +9. Monitors GitHub Actions workflow execution +10. Verifies public repository (no leaks) +11. Checks deployment status (https://www.vlm-robotics.dev) +12. Generates release summary: `.claude/releases/release-<tag>.md` + +**Fail-safe**: Aborts at ANY failed check, provides remediation instructions + +**When to use**: When ready to publish assignment to students + +--- + +#### `/new-assignment` - Assignment Scaffolding +**Purpose**: Create complete assignment structure with templates + +**Usage**: +```bash +/new-assignment +# Enter name, type, focus, difficulty +``` + +**What it does**: +1. 
Creates directory structure: + - `src/assignments/<name>/` (starter code with TODOs) + - `private/solutions/<name>/` (solution templates) + - `tests/public/test_<name>_basic.py` (student-visible tests) + - `tests/internal/test_<name>_rigor.py` (grading tests) + - `content/course/assignments/<name>.mdx` (assignment spec) +2. Generates Python templates +3. Generates test templates +4. Creates README files + +**Next steps after scaffolding**: +1. Complete solution implementations +2. Run `/generate-fixtures` +3. Update MDX spec +4. Run `/test-rigor` +5. Commit changes +6. Run `/release` + +--- + +#### `/sync-check` - Post-Release Verification +**Purpose**: Verify public repository has no leaks after release sync + +**Usage**: +```bash +/sync-check +# Select: "Latest" or specify release tag +``` + +**What it does**: +1. Clones public repo to temp directory (read-only) +2. Scans for `[SOLUTION]` markers +3. Checks for private directories (`private/`, `tests/internal/`) +4. Checks for private scripts (`dev_utils.py`, `sanitize.sh`) +5. Checks for sensitive files (credentials, `*_solution.py`) +6. Verifies orphan push strategy (no linked git history) +7. Compares file lists (private vs public) +8. Checks deployment status (HTTP 200) +9. Runs sample fidelity check +10. Generates verification report: `.claude/sync-reports/sync-check-<tag>.md` +11. 
Cleans up temp files + +**When to use**: Always run after `/release` completes + +**Critical**: If leaks detected, report provides urgent remediation steps + +--- + +## Commands Useful in Development + +### Solution Management +```bash +# List all available solutions +uv run python scripts/dev_utils.py --list + +# Inject solutions for testing/grading +uv run python scripts/dev_utils.py --inject scratch-1 + +# Reset to starter code +uv run python scripts/dev_utils.py --reset scratch-1 + +# Verify no solution leaks (similarity check) +uv run python scripts/dev_utils.py --verify-clean +``` + +### Testing +```bash +# Run public tests (students can see these) +uv run pytest tests/public/ -v + +# Run internal grading tests (after injecting solutions) +uv run pytest tests/internal/ -v -m rigor + +# Run specific test file +uv run pytest tests/internal/test_scratch1_rigor.py -v + +# Generate HTML report +uv run pytest tests/internal/ --html=tests/internal/reports/report.html --self-contained-html +``` + +### Pre-Release Checks +```bash +# Complete pre-flight check +/pre-flight + +# Or manually: +uv run python scripts/dev_utils.py --verify-clean +bash scripts/sanitize.sh # (Only in orphan branch workflow) +``` + +### GitHub Operations +```bash +# List open student PRs +gh pr list --base staging --state open + +# View PR details +gh pr view 123 + +# Comment on PR +gh pr comment 123 --body "Feedback here" + +# Merge PR +gh pr merge 123 --squash +``` + +--- + +## Linting and Formatting + +### Semantic Line Breaks +**All MDX files MUST use one sentence per line.** This is mandatory to allow granular, line-by-line feedback in Pull Requests. + +**Bad:** +```markdown +This is a very long sentence with multiple ideas. It continues on the same line. This makes PR review difficult. +``` + +**Good:** +```markdown +This is a sentence on its own line. +Each idea gets its own line. +This makes PR review much easier. 
+``` + +### LaTeX +Use formal LaTeX for all mathematical derivations: +```markdown +The loss function is: +$$ +\mathcal{L} = -\sum_{t=1}^T \log p(a_t | s_t, I_t) +$$ +``` + +Do not use code blocks for math. + +### Next.js Linting +```bash +pnpm lint +``` + +--- + +## Testing Philosophy + +### Public Tests (`tests/public/`) +**Purpose**: Student-visible validation tests + +**What they test**: +- Basic model structure (initialization, shapes) +- Forward pass correctness (no NaNs, correct dimensions) +- Gradient flow (backpropagation works) + +**Students can run these**: `pytest tests/public/test_scratch1_basic.py -v` + +### Internal Tests (`tests/internal/`) +**Purpose**: Rigorous grading tests (NEVER synced to public) + +**What they test**: +- **Gradient Leak Test**: Verify frozen parameters (e.g., DINOv2 backbone) +- **Latent Fidelity Test**: Compare output against gold standard fixtures +- **Training Convergence Test**: Verify model can train and loss decreases +- **Edge Case Tests**: Boundary conditions, error handling + +**Markers**: +- `@pytest.mark.internal` - All internal tests +- `@pytest.mark.rigor` - Strict grading tests +- `@pytest.mark.gradient` - Gradient flow tests +- `@pytest.mark.fidelity` - Output comparison tests +- `@pytest.mark.training` - Training convergence tests + +**Run with**: `pytest tests/internal/ -v -m rigor` + +--- + +## Interacting with the App + +### Local Development +```bash +pnpm dev +# Access at http://localhost:3000 +``` + +### Staging Previews +Every Pull Request to `staging` branch triggers deployment to: +``` +https://vlm-robotics.dev/staging/pulls/[PR_NUMBER]/ +``` + +**Review Protocol**: +1. Read the rendered audit on the staging site +2. Comment on the **source MDX** in GitHub "Files Changed" tab +3. 
Use the **Rich Diff** view in GitHub to verify LaTeX rendering + +### Production +Production site deployed at: +``` +https://www.vlm-robotics.dev +``` + +Deployment triggered by: +- Push to `main` branch (after staging → main merge) +- GitHub Action: `.github/workflows/deploy.yml` +- Deploys to ristoffer.ch via SSH + +--- + +## Patterns & Standards + +### Amazon Principle +We do not write "summaries." We write rigorous, durable **Audits**. A high-fidelity audit IS the textbook chapter. + +### Textbook Audit Sidebars +Every audit MUST contain these three technical sidebars: + +1. **The Lineage of Failure**: Why previous approaches died +2. **Intuitive Derivation**: The geometric/mathematical intuition of the loss function +3. **Implementation Gotchas**: Practitioners' notes on coordinate frames, normalization, or hyperparameters + +### The Interface Focus +When auditing VLA models, focus on the **Interface**: +- **Input Projection**: Pixels → Tokens +- **Action Head**: Tokens → Trajectories +- **The Loss/Objective Function** + +### Git Hygiene +We are a **rebase-only** lab. Use `git rebase main`. PRs containing "Merge branch 'main'" commits will be closed. + +**Correct workflow**: +```bash +git fetch origin +git rebase origin/main +git push --force-with-lease +``` + +### Sanitization +All private solutions are marked with `[SOLUTION]` tags: +```python +# TODO: Implement RMSNorm forward pass +# [SOLUTION] Use torch.rsqrt for efficiency +result = torch.rsqrt(variance + self.eps) +``` + +The sanitization pipeline: +1. `scripts/_sanitize_todos.py` - Removes `[SOLUTION]` markers +2. `scripts/sanitize.sh` - Orchestrates full cleanup (private dirs, scripts, README) +3. Triggered automatically by `.github/workflows/sync-to-public.yml` on release tags + +**Load-bearing wall**: `scripts/sanitize.sh` is the primary defense against solution leaks. 
+ +### Orphan Push Strategy +When syncing to public repo, we use **orphan branches** to break all git history links: + +```bash +git checkout --orphan temp-public-branch +git add -A +git commit -m "Public Release: $(date)" +git push public temp-public-branch:main --force +``` + +**Benefits**: +- No commit history from private repo exposed +- Public repo has completely independent git history +- Maximum security against accidental leaks via `git log` + +--- + +## File Map of Interest + +### GitHub Actions +- [.github/workflows/sync-to-public.yml](.github/workflows/sync-to-public.yml) - Automated sync to public repo (orphan push) +- [.github/workflows/shadow-tester.yml](.github/workflows/shadow-tester.yml) - Shadow CI for student PRs +- [.github/workflows/deploy.yml](.github/workflows/deploy.yml) - Production deployment to ristoffer.ch + +### Configuration +- [next.config.ts](next.config.ts) - Next.js config with dynamic routing for staging +- [pytest.ini](pytest.ini) - pytest markers configuration +- [tailwind.config.ts](tailwind.config.ts) - Tailwind CSS configuration + +### Scripts +- [scripts/dev_utils.py](scripts/dev_utils.py) - Solution management (inject/reset/verify-clean) +- [scripts/sanitize.sh](scripts/sanitize.sh) - Complete sanitization pipeline +- [scripts/_sanitize_todos.py](scripts/_sanitize_todos.py) - TODO comment sanitizer + +### Claude Code Skills +- [.claude/skills/](/.claude/skills/) - All skill definitions +- [.claude/skills/README.md](.claude/skills/README.md) - Comprehensive skills documentation +- [.claude/commands/](.claude/commands/) - Command shortcuts + +### Components +- [components/audit/AuditLayout.tsx](components/audit/AuditLayout.tsx) - Primary wrapper for rendered textbook chapters + +### Testing +- [tests/conftest.py](tests/conftest.py) - pytest fixtures (auto-inject for internal tests) +- [tests/public/](tests/public/) - Student-visible tests +- [tests/internal/](tests/internal/) - Internal grading tests + +### Documentation +- 
[INSTRUCTOR.md](INSTRUCTOR.md) - Complete instructor guide (consolidated) +- [SKILLS_COMPLETE.md](SKILLS_COMPLETE.md) - Skills implementation summary +- [REFACTOR_COMPLETE.md](REFACTOR_COMPLETE.md) - Repository hardening summary + +--- + +## Typical Workflows + +### Creating a New Assignment +```bash +# 1. Scaffold structure +/new-assignment + +# 2. Implement solutions +# Edit: private/solutions/scratch-3/model_solution.py + +# 3. Generate fixtures +/generate-fixtures + +# 4. Update spec +# Edit: content/course/assignments/scratch-3.mdx + +# 5. Test grading +/test-rigor + +# 6. Commit +git add . && git commit -m "feat: add scratch-3 assignment" + +# 7. Release +/release + +# 8. Verify +/sync-check +``` + +### Grading Student Work +```bash +# 1. List PRs +gh pr list --base staging + +# 2. Grade PR +/grade + +# 3. Review report +cat tests/internal/reports/grade-pr123.md + +# 4. Merge if approved +gh pr merge 123 --squash +``` + +### Pre-Release Checklist +```bash +# 1. Audit +/vla-guard + +# 2. Pre-flight (audit + sanitize) +/pre-flight + +# 3. Release +/release + +# 4. Verify +/sync-check +``` + +--- + +## Shadow CI + +Student PRs to the public repo trigger **Shadow CI** - hidden testing with internal grading suite: + +1. Student opens PR to `arpg/vla-foundations` (public) +2. Public `.github/workflows/vla-audit.yml` triggers `repository_dispatch` to private repo +3. Private `.github/workflows/shadow-tester.yml` runs: + - Fetches student code + - Injects solutions + - Runs internal tests + - Posts Pass/Fail comment on public PR (no details) +4. Instructor uses `/grade` for detailed feedback + +**Purpose**: Catch critical failures early without exposing grading logic. 
+ +--- + +## Security Boundaries + +### NEVER Sync to Public +- `private/` directory (complete solutions) +- `tests/internal/` directory (grading tests) +- `scripts/dev_utils.py` (solution management) +- `scripts/sanitize.sh` (sanitization script) +- `scripts/_sanitize_todos.py` (helper script) +- `.claude/` directory (instructor automation) +- Files with `[SOLUTION]` markers + +### Multi-Layer Protection +1. **Pre-commit hook** - Blocks commits with `[SOLUTION]` in public files +2. **VLA Guard skill** - Scans for leaks before operations +3. **Sanitization pipeline** - Removes private content automatically +4. **Post-sanitization validation** - Fail-safe check in GitHub Actions +5. **Orphan push** - Breaks git history links +6. **Sync-check skill** - Verifies public repo after release + +--- + +## Requirements + +- **Node.js** 18+ +- **pnpm** 8+ +- **Python** 3.11+ +- **uv** (Python package manager): `curl -LsSf https://astral.sh/uv/install.sh | sh` +- **gh CLI** (for skills): `brew install gh && gh auth login` + +Python dependencies (managed by uv via `pyproject.toml`): +- pytest, pytest-html +- torch +- numpy + +--- + +## Support + +- **Instructor Guide**: [INSTRUCTOR.md](INSTRUCTOR.md) +- **Skills Documentation**: [.claude/skills/README.md](.claude/skills/README.md) +- **Public Repo**: https://github.com/arpg/vla-foundations +- **Course Website**: https://www.vlm-robotics.dev + +--- + +**Remember**: This is the private instructor repository. Always run `/vla-guard` before any public-facing operation. 
diff --git a/components/audit/AuditLayout.tsx b/components/audit/AuditLayout.tsx index a44c8163..71234d86 100644 --- a/components/audit/AuditLayout.tsx +++ b/components/audit/AuditLayout.tsx @@ -13,26 +13,55 @@ interface Chapter { interface AuditLayoutProps { children: ReactNode; chapters: Chapter[]; + isReviewMode?: boolean; + prNumber?: string; } -export function AuditLayout({ children, chapters }: AuditLayoutProps) { +export function AuditLayout({ children, chapters, isReviewMode = false, prNumber }: AuditLayoutProps) { return ( -
+
-
+
+ {/* Review Mode Banner */} + {isReviewMode && ( +
+
+
+ + + + +
+
+

+ 👁️ REVIEW MODE +

+

+ You are viewing a preview of this audit. This content is under review and not yet published. +

+ {prNumber && ( +

+ Preview from PR #{prNumber} +

+ )} +
+
+
+ )} +
{children}
-
diff --git a/content/course/assignments/capstone.mdx b/content/course/assignments/capstone.mdx index 43b7f82c..a267bc61 100644 --- a/content/course/assignments/capstone.mdx +++ b/content/course/assignments/capstone.mdx @@ -1,377 +1,95 @@ --- -title: 'Capstone Project: Textbook Contribution & Implementation' +title: 'The VLA Capstone: Engineering the Frontier' assignment: 3 -due: 'Week 16' -points: 300 +due: 'Finals Week' +points: 250 --- -
-

⚠️ DRAFT: NOT YET ASSIGNED

-

This assignment is still under review and subject to change. Do not begin work until this notice is removed.

-
+# The VLA Capstone: From Audit to Architecture -# Capstone Project: Textbook Contribution & Implementation +**Weight:** 25% of Final Grade +**Initial Project Specification Due:** Leading into the Architecture Lab (This Thursday). +**Mastery Deadline:** Finals Week. -## Objective +## The Philosophy: Audit, Implement, Extend -Make a substantive contribution to the VLA Foundations textbook by authoring technical content, implementing code, and presenting your work to the class. - -## Learning Goals - -- **Synthesize** knowledge from multiple research papers -- **Implement** a non-trivial VLA component or experiment -- **Communicate** technical concepts clearly in writing -- **Present** findings to a technical audience +In this course, we do not perform "re-implementations" for the sake of practice. The Capstone is a substantive contribution to the `vlm-robotics.dev` living textbook. You are expected to move from an auditor (Assignment 2) to an architect—identifying a bottleneck, proposing a delta, and proving it via implementation. ## Project Tracks -Choose **one** of the following tracks: - -### Track 1: Research Extension - -Extend an existing VLA paper with novel experiments or analysis. - -**Requirements:** -- Reproduce baseline results from a published paper -- Design and run new experiments that test an unexplored dimension -- Contribute a textbook section analyzing your findings - -**Example Projects:** -- "Does RT-2 generalize to novel object geometries?" - Test on CAD-generated objects -- "Scaling laws for VLA data diversity" - Ablate dataset composition -- "Failure modes of diffusion policies in cluttered scenes" - Systematic failure analysis +Choose **one** of the following tracks for your technical deep-dive: -### Track 2: Engineering Implementation +### Track 1: Research Extension (The "Delta" Track) +Extend an existing VLA paper with novel experiments. 
+- **Requirement:** Reproduce a baseline, then design experiments testing a specific "Initial Dissolve" (e.g., "Does RT-2 generalize to novel object geometries created in simulation?"). +- **Textbook Contribution:** A section analyzing your findings and the "Information Decay" observed. +### Track 2: Engineering Implementation (The "Systems" Track) Build a production-grade VLA component from scratch. +- **Requirement:** Implement a key technique (e.g., an optimized vision encoder for 50Hz control, a cross-embodiment training harness). +- **Textbook Contribution:** A technical "Implementation Gotchas" guide and practitioners' manual for your component. -**Requirements:** -- Implement a key VLA technique (encoder, policy, training pipeline) -- Write clean, documented, tested code -- Contribute a textbook section with implementation details - -**Example Projects:** -- "Efficient vision encoder for real-time robotic control" - Optimized transformer -- "Multi-task policy training framework" - PyTorch training harness -- "Sim-to-real transfer toolkit" - Domain randomization + evaluation suite - -### Track 3: Comprehensive Survey - -Write an authoritative survey of a VLA subtopic. - -**Requirements:** -- Read 15-20 papers in a focused area -- Identify trends, gaps, and open questions -- Contribute a textbook section synthesizing the literature - -**Example Projects:** -- "Data augmentation strategies for robotic learning" - Survey + taxonomy -- "Benchmarking protocols for manipulation tasks" - Analysis of evaluation methods -- "Foundation models for embodied AI: A critical review" - Strengths/weaknesses analysis - -## Deliverables - -### 1. Proposal (Week 8) - -**Submit**: 1-2 page proposal via pull request - -**Contents:** -- Track selection (Research/Engineering/Survey) -- Problem statement and motivation -- Planned approach and timeline -- Expected contribution to textbook - -**Grading**: Pass/Fail (instructor feedback provided) - -### 2. 
Textbook Chapter Contribution (Week 16) - -**Submit**: MDX file with written content - -**Requirements:** -- 2000-4000 words of technical writing -- LaTeX equations for mathematical formulations -- Code snippets (if applicable) -- References to relevant papers -- Fits cohesively into one of the 8 textbook chapters - -**Location**: `content/textbook/[chapter-name]/your-section.mdx` - -**Example Structure:** +### Track 3: Synthesis & Taxonomy (The "Survey" Track) +Write an authoritative survey of a VLA sub-domain. +- **Requirement:** Read 15-20 papers. Identify the "Lineage of Failure" and the scaling laws of the sub-topic. +- **Textbook Contribution:** A foundational chapter synthesizing the literature into a cohesive taxonomy. -```mdx --- -title: "3.5 Your Section Title" -chapter: 3 -subsection: 5 -author: "Your Name" ---- - -# 3.5 Your Section Title - -## Motivation - -Why does this topic matter? - -## Background - -What do readers need to know? - -## Method - -How does it work? (Include equations) - -## Results - -What did you find? (Include figures/tables) - -## Discussion - -What are the implications? - -## References - -[Numbered references] -``` - -### 3. Code Implementation (Weeks 12-16) - -**Required for Research & Engineering tracks** (optional for Survey track) - -**Submit**: Pull request with code - -**Requirements:** -- Clean, documented Python code -- README with setup instructions -- Example usage / demo script -- Unit tests (if applicable) - -**Location**: `code/capstone/your-project-name/` - -**Grading Criteria:** -- Code quality and organization (30%) -- Documentation and comments (30%) -- Functionality and correctness (40%) -### 4. Final Presentation (Week 16) +## Technical Requirements -**Format**: 15-minute presentation + 5-minute Q&A +### 1. The Architectural Delta +Your project must identify a specific bottleneck in a "Primary Paper." You are not parrots; you are auditors. 
If you choose Track 1 or 2, you must justify your architectural changes using the **Amazon Principle**: write a technical specification that proves why this change is necessary. -**Contents:** -1. Problem statement and motivation (2 min) -2. Approach and methodology (5 min) -3. Results and findings (5 min) -4. Textbook contribution overview (2 min) -5. Lessons learned and future work (1 min) +### 2. The Data Mix +You must explicitly define your data curation strategy: +- **Foundational Priors:** Which internet-scale weights (SigLIP, DINOv2) are you using? +- **Embodied Data:** Which subset of Open X-Embodiment or DROID are you sampling? +- **Synthetic Multiplication:** Are you using *MimicGen* or *RoboGen* to scale your seeds? -**Slides**: Submit PDF via pull request +### 3. Formalized Logic & Derivations +Your documentation must be grounded in $\LaTeX$. +- Derive your specific loss function $\mathcal{L}_{total}$. +- Define the state-space $S$ and the action-space $A$ (e.g., Delta-EE, Joint Velocities, or Latent Tokens). -## Timeline +### 4. Semantic Form +All MDX contributions must follow the **Semantic Line Break** rule (one sentence per line). This is mandatory for the PR review process. 
-| Week | Milestone | -|------|-----------| -| 8 | Proposal due | -| 10 | Progress check-in (office hours) | -| 12 | Draft textbook section (optional feedback) | -| 14 | Code implementation complete | -| 16 | Final presentation + all deliverables due | - -## Grading Rubric (300 points) - -| Component | Points | -|-----------|--------| -| **Textbook Contribution** | **150** | -| - Technical accuracy | 50 | -| - Writing clarity | 40 | -| - Integration with existing chapters | 30 | -| - References and citations | 30 | -| **Implementation / Code** | **100** | -| - Functionality | 40 | -| - Code quality | 30 | -| - Documentation | 30 | -| **Presentation** | **50** | -| - Content clarity | 20 | -| - Slide quality | 15 | -| - Q&A responses | 15 | -| **Total** | **300** | - -## Evaluation Criteria - -### Textbook Contribution - -**Excellent (90-100%)**: -- Novel insights or analysis -- Crystal-clear explanations -- Publication-quality figures and equations -- Comprehensive references - -**Good (80-89%)**: -- Accurate technical content -- Clear writing with minor issues -- Relevant figures and equations -- Adequate references - -**Acceptable (70-79%)**: -- Mostly accurate content -- Understandable but needs polish -- Basic figures/equations -- Some key references missing - -### Code Implementation - -**Excellent (90-100%)**: -- Production-ready code -- Comprehensive documentation -- Runs out-of-the-box -- Includes tests and examples - -**Good (80-89%)**: -- Functional code -- Adequate documentation -- Minor setup issues -- Basic examples - -**Acceptable (70-79%)**: -- Code works with effort -- Minimal documentation -- Requires debugging -- No examples - -### Presentation - -**Excellent (90-100%)**: -- Engaging and clear -- Well-structured slides -- Confident Q&A responses -- On time - -**Good (80-89%)**: -- Clear presentation -- Decent slides -- Handles most questions -- Slightly over/under time - -**Acceptable (70-79%)**: -- Understandable content -- Basic slides -- 
Struggles with some questions -- Noticeable timing issues - -## Submission Process - -### Proposal (Week 8) - -```bash -git checkout -b capstone-proposal-yourname -# Add file: content/course/proposals/yourname-proposal.md -git add content/course/proposals/yourname-proposal.md -git commit -m "Add capstone proposal: Your Name" -git push origin capstone-proposal-yourname -# Open PR to staging -``` - -### Final Submission (Week 16) - -```bash -git checkout -b capstone-final-yourname -# Add textbook section: content/textbook/[chapter]/your-section.mdx -# Add code (if applicable): code/capstone/your-project/ -# Add slides: presentations/yourname-final.pdf -git add . -git commit -m "Add capstone project: Your Title" -git push origin capstone-final-yourname -# Open PR to staging -``` - -## Example Projects from Past Semesters - -### Research Track - -**"Generalization of RT-2 to Novel Objects"** (Jane Doe, 2025) -- Reproduced RT-2 baseline on Open-X dataset -- Generated 50 novel 3D objects with unseen geometries -- Found 23% performance drop on novel objects -- Contributed to Chapter 4 (Evaluation) - -**"Data Augmentation for Robotic Grasping"** (John Smith, 2025) -- Implemented 8 augmentation strategies -- Trained policies with systematic ablations -- Identified that rotation augmentation improves generalization by 15% -- Contributed to Chapter 2 (Data) - -### Engineering Track - -**"Real-Time Vision Encoder for Edge Deployment"** (Alice Johnson, 2025) -- Implemented MobileViT-based encoder -- Achieved 30 FPS on Jetson Orin -- Only 5% accuracy drop vs. 
ViT-B -- Contributed to Chapter 5 (Deployment) - -**"Multi-Task Policy Training Framework"** (Bob Williams, 2025) -- Built PyTorch training harness for 10+ tasks -- Supports multi-GPU, checkpointing, logging -- Open-sourced with 500+ GitHub stars -- Contributed to Chapter 3 (Training) - -### Survey Track - -**"Benchmarking Protocols for Manipulation"** (Carol Lee, 2025) -- Analyzed 30 papers on manipulation benchmarks -- Created taxonomy of evaluation metrics -- Identified reproducibility issues in 60% of papers -- Contributed to Chapter 4 (Evaluation) - -**"Foundation Models for Embodied AI: A Survey"** (David Chen, 2025) -- Surveyed 40 papers on VLMs for robotics -- Mapped landscape of architectures and datasets -- Identified key open problems -- Contributed to Chapter 7 (Future Directions) - -## Resources - -### Writing - -- [How to Write a Great Research Paper](https://www.microsoft.com/en-us/research/academic-program/write-great-research-paper/) -- [LaTeX Math Symbols](https://www.overleaf.com/learn/latex/List_of_Greek_letters_and_math_symbols) -- [MDX Documentation](https://mdxjs.com/) - -### Code - -- [Google Python Style Guide](https://google.github.io/styleguide/pyguide.html) -- [PyTorch Best Practices](https://pytorch.org/tutorials/beginner/saving_loading_models.html) -- [Writing Good Documentation](https://www.writethedocs.org/guide/writing/beginners-guide-to-docs/) - -### Presentation - -- [How to Give a Great Research Talk](https://www.microsoft.com/en-us/research/academic-program/give-great-research-talk/) -- [Presentation Tips](https://www.cs.cmu.edu/~wloescher/presentations.html) +--- -## FAQs +## Team Structure: The $2\times$ Rule -**Q: Can I work in a team?** -A: No, capstone projects must be individual work. However, you can discuss ideas with classmates. +- **Individual Work:** The baseline for a high-quality contribution. +- **Group Work (Optional):** If you choose to work in a group, the technical bar for "Mastery" scales linearly. 
A 2-person team must go **$2\times$ as far**—meaning significantly larger data mixes, more robust baseline comparisons, or cross-embodiment evaluation. +- **Note:** Groups must provide a "Team Contribution Statement" in their proposal. -**Q: Can I extend my paper audit into a capstone?** -A: Yes! If you found an interesting research question during a paper audit, you can explore it further. +--- -**Q: What if my code doesn't work perfectly?** -A: Document what worked, what didn't, and why. Partial results are acceptable if well-analyzed. +## Deliverables & Grading Rubric (250 Points Total) -**Q: Can I contribute to multiple textbook chapters?** -A: Focus on one cohesive section. Quality over quantity. +### 1. Project Specification / Proposal (Pass/Fail - First Architectural Lab) +Submit via the **VLA Architecture Lab Form**. Includes team members, the "Initial Dissolve," and compute/data requirements. -**Q: What if my project scope changes?** -A: Discuss with the instructor. Pivots are allowed with justification. +### 2. Textbook Chapter Contribution (100 Points) +- **Technical Accuracy & Rigor (50 pts):** Correct $\LaTeX$, sound mathematical derivations, and deep critique. +- **Writing & Insights (50 pts):** Must include *Lineage of Failure*, *Intuitive Derivations*, and *Implementation Gotchas*. -## Getting Help +### 3. Code Implementation (75 Points) +- **Functionality & Correctness (50 pts):** Does it solve the stated bottleneck? +- **Code Quality & Docs (25 pts):** Clean Python, README with setup, and unit tests. -- **Office Hours**: Every Tuesday/Thursday 3-4 PM -- **Discussion Forum**: Post questions and get peer feedback -- **Mid-Project Check-In**: Schedule a meeting in Week 12 +### 4. Final Presentation (75 Points) +- **Content Density (50 pts):** 15-minute technical brief. +- **Q&A Rigor (25 pts):** Ability to defend your load-bearing assertions. 
-## Final Notes +--- -The capstone is your opportunity to make a lasting contribution to the VLA research community. Past student projects have been cited in papers, used by other researchers, and featured in the textbook for future cohorts. +## Submission Process: The PR Workflow -**Aim for work you'd be proud to showcase in a job interview or PhD application.** +1. **Branching:** `git checkout -b project/your-handle-topic` +2. **Pathing:** - **Textbook:** `content/textbook/[chapter]/your-section.mdx` + - **Code:** `code/capstone/your-project/` + - **Slides:** `presentations/your-name-final.pdf` +3. **The Loop:** Open a PR to `staging`. A bot will provide a preview link. Iterate until your project reaches **Level 3 (Mastery)** and is merged into the `main` textbook. -Good luck! +> **Final Note:** The capstone is your opportunity to make a lasting contribution to the VLA research community. Aim for work you would be proud to showcase in an AI Engineering interview. diff --git a/content/course/assignments/scratch-1.mdx b/content/course/assignments/scratch-1.mdx index 2b1f2bce..cdb5636d 100644 --- a/content/course/assignments/scratch-1.mdx +++ b/content/course/assignments/scratch-1.mdx @@ -5,11 +5,6 @@ due: 'Sunday, February 1, 9:00 AM MST' points: 100 --- -
-

⚠️ DRAFT: NOT YET ASSIGNED

-

This assignment is still under review and subject to change. Do not begin work until this notice is removed.

-
- # Scratch-1: The Transformer Backbone **Focus**: Implementing the $O(1)$ engine of the VLA stack. @@ -187,7 +182,9 @@ When I removed the causal mask, the following happened: ### Pass Level (B): 70-89 points - ✅ Successful implementation of the backbone -- ✅ Loss converges on the synthetic dataset (< 1.0) +- ✅ Loss shows clear convergence (appreciable decrease from initial loss) + - Expected: Initial loss ~3-4, Final loss ~1.9-2.2 + - Model should demonstrate learning, not achieve arbitrary threshold - ✅ Attention maps visualization included - ✅ Causal mask audit completed - ✅ Code is clean and documented @@ -307,6 +304,13 @@ A: Check: 2. Is the learning rate too high? (Try 1e-4) 3. Are gradients exploding? (Enable gradient clipping) +**Q: What loss should I expect?** +A: With correct implementation on the synthetic trajectory dataset: +- **Initial loss**: ~3-4 (near random guessing for 256-way classification) +- **Final loss**: ~1.9-2.2 (showing clear learning) +- **Key metric**: Appreciable decrease indicating the model learns patterns +- The action encoding represents direction + magnitude toward target, which is learnable but not trivial + ## 11. Deadline **Due**: Sunday, February 1, 9:00 AM MST diff --git a/content/course/submissions/scratch-1/Soorej S Nair.mdx b/content/course/submissions/scratch-1/Soorej S Nair.mdx new file mode 100644 index 00000000..9b291ef7 --- /dev/null +++ b/content/course/submissions/scratch-1/Soorej S Nair.mdx @@ -0,0 +1,133 @@ +--- +title: "Scratch-1 Submission: Soorej S Nair" +student: "Soorej S Nair" +date: "2026-02-03" +--- + +# Scratch-1: The Transformer Backbone + +## Loss Curve + +![Training Loss](./images/loss_curve.png) + +The model converged after around 10 iterations with final loss of 1.9277. + +## Attention Visualization + +![Attention Maps](./images/attention_maps.png) + +The attention patterns show the causal mask has worked. 
The dark upper right triangle does not receive any attention, this demonstrates that because of the causal mask, the future tokens are masked and are not passed into the model. The bright diagonal shows that the model focuses most on the latest token in the input sequence. The attention reduces as we go away from the recent tokens, as expected. + +### Sample output +``` +Using device: cpu +Epoch 1 Batch 100: avg_loss = 4.142927 +Epoch 1 Batch 200: avg_loss = 3.567831 +Epoch 1/20 - Loss: 3.2584 +Epoch 2 Batch 100: avg_loss = 2.299740 +Epoch 2 Batch 200: avg_loss = 2.243028 +Epoch 2/20 - Loss: 2.2111 +Epoch 3 Batch 100: avg_loss = 2.099218 +Epoch 3 Batch 200: avg_loss = 2.090066 +Epoch 3/20 - Loss: 2.0819 +Epoch 4 Batch 100: avg_loss = 2.046171 +Epoch 4 Batch 200: avg_loss = 2.041124 +Epoch 4/20 - Loss: 2.0388 +Epoch 5 Batch 100: avg_loss = 2.021413 +Epoch 5 Batch 200: avg_loss = 2.018425 +Epoch 5/20 - Loss: 2.0147 +Epoch 6 Batch 100: avg_loss = 2.001702 +Epoch 6 Batch 200: avg_loss = 2.000735 +Epoch 6/20 - Loss: 1.9991 +Epoch 7 Batch 100: avg_loss = 1.989224 +Epoch 7 Batch 200: avg_loss = 1.988070 +Epoch 7/20 - Loss: 1.9868 +Epoch 8 Batch 100: avg_loss = 1.981476 +Epoch 8 Batch 200: avg_loss = 1.981199 +Epoch 8/20 - Loss: 1.9792 +Epoch 9 Batch 100: avg_loss = 1.971384 +Epoch 9 Batch 200: avg_loss = 1.971669 +Epoch 9/20 - Loss: 1.9706 +Epoch 10 Batch 100: avg_loss = 1.963031 +Epoch 10 Batch 200: avg_loss = 1.963841 +Epoch 10/20 - Loss: 1.9645 +Epoch 11 Batch 100: avg_loss = 1.956031 +Epoch 11 Batch 200: avg_loss = 1.958478 +Epoch 11/20 - Loss: 1.9595 +Epoch 12 Batch 100: avg_loss = 1.950375 +Epoch 12 Batch 200: avg_loss = 1.952415 +Epoch 12/20 - Loss: 1.9539 +Epoch 13 Batch 100: avg_loss = 1.945426 +Epoch 13 Batch 200: avg_loss = 1.948742 +Epoch 13/20 - Loss: 1.9498 +Epoch 14 Batch 100: avg_loss = 1.942955 +Epoch 14 Batch 200: avg_loss = 1.944969 +Epoch 14/20 - Loss: 1.9462 +Epoch 15 Batch 100: avg_loss = 1.941023 +Epoch 15 Batch 200: avg_loss = 1.941831 +Epoch 15/20 
- Loss: 1.9429 +Epoch 16 Batch 100: avg_loss = 1.936545 +Epoch 16 Batch 200: avg_loss = 1.938385 +Epoch 16/20 - Loss: 1.9388 +Epoch 17 Batch 100: avg_loss = 1.931708 +Epoch 17 Batch 200: avg_loss = 1.934831 +Epoch 17/20 - Loss: 1.9356 +Epoch 18 Batch 100: avg_loss = 1.929214 +Epoch 18 Batch 200: avg_loss = 1.931722 +Epoch 18/20 - Loss: 1.9326 +Epoch 19 Batch 100: avg_loss = 1.926539 +Epoch 19 Batch 200: avg_loss = 1.928762 +Epoch 19/20 - Loss: 1.9301 +Epoch 20 Batch 100: avg_loss = 1.923337 +Epoch 20 Batch 200: avg_loss = 1.926616 +Epoch 20/20 - Loss: 1.9277 +Saved loss curve to /Users/soorejsnair/Documents/code/Course Folder/Spring 26/CSCI 7000/vla-foundations/content/course/submissions/scratch-1/images/loss_curve.png +Saved attention heatmap to /Users/soorejsnair/Documents/code/Course Folder/Spring 26/CSCI 7000/vla-foundations/content/course/submissions/scratch-1/images/attention_maps.png +``` +## The Audit: Removing the Causal Mask + +When I removed the causal mask, the following happened: + +* Loss observed +![Training Loss](./images/loss_curve_without_causal_mask.png) +We see that the loss converges quickly with a final loss of 0.0460 which is much lower than the one with the causal mask. This is because of data leak. Since we do not use causal mask the model uses the future tokens to train the model and causes accuracy to skyrocket since it is predicting tokens it is already trained on. + + +* Attention Map +![Attention Maps](./images/attention_maps_without_causal_mask.png) +The attention map also says the same thing, instead of the lower triangle alone, the attention is now spread over all the tokens (including the future tokens) + +### Why the Model "Cheats" + +* This behaviour shows that, the model makes a prediction using the future rokens, which is unlike the expected behavious of autoregressive models. 
The model knowing the future token, training on it and then making a prediction with very low loss is considered "cheating". + + +## Why RoPE Is Superior to Sinusoidal Positional Embeddings + +In standard sinusoidal positional embeddings, absolute position vectors are added directly to token embeddings before attention is computed. This causes the dot-product attention score to entangle content and absolute position, making the model sensitive to when a pattern occurs rather than how tokens relate to each other. + +Rotary Positional Embeddings (RoPE) instead apply a position-dependent rotation to the query and key vectors. This transforms the attention score as: + +$ (R(i)q_i)^\top (R(j)k_j) = q_i^\top R(j - i) k_j $ + + +This formula shows that attention depends explicitly on the relative difference between the tokens rather than on their absolute positions. For spatial and trajectory data, where motion patterns are translation-invariant, this is a more appropriate inductive bias. + +### Ablation: RoPE vs Sinusoidal Embeddings + +We trained two identical models differing only in their positional encoding scheme. The RoPE-based model converged faster and achieved lower final loss (1.9277) compared to the sinusoidal baseline (1.9858). This confirms that relative positional encoding improves learning efficiency and generalization for spatial robot trajectories. + +![Training Loss](./images/loss_curve_sinusoidal.png) + +## Code Highlights + +* **Configurable Positional Encoding (RoPE vs. Sinusoidal).** + * The implementation supports both Rotary Positional Embeddings (RoPE) and sinusoidal positional embeddings, controlled by the global SINUSOIDAL flag. When SINUSOIDAL is False, RoPE embeddings are used; when SINUSOIDAL is True, absolute sinusoidal embeddings are passed to the model. + +* **KV caching for efficient inference.** + + * KV caching reduces inference complexity from O(T²) to O(T) by reusing key/value tensors across decoding steps. 
Empirically, generation latency per token drops significantly after the first token. + +* **Causal mask control** + * Causal masking is implemented explicitly using a lower-triangular attention mask and can be toggled via the CAUSAL_MASKING flag. This allowed direct inspection of the “causal cheating” behavior when the mask is disabled, making the effect of information leakage easy to observe both in loss curves and attention visualizations. + diff --git a/content/course/submissions/scratch-1/images/attention_maps.png b/content/course/submissions/scratch-1/images/attention_maps.png new file mode 100644 index 00000000..3915b39e Binary files /dev/null and b/content/course/submissions/scratch-1/images/attention_maps.png differ diff --git a/content/course/submissions/scratch-1/images/attention_maps_sinusoidal.png b/content/course/submissions/scratch-1/images/attention_maps_sinusoidal.png new file mode 100644 index 00000000..60711548 Binary files /dev/null and b/content/course/submissions/scratch-1/images/attention_maps_sinusoidal.png differ diff --git a/content/course/submissions/scratch-1/images/attention_maps_without_causal_mask.png b/content/course/submissions/scratch-1/images/attention_maps_without_causal_mask.png new file mode 100644 index 00000000..b7ca6c1e Binary files /dev/null and b/content/course/submissions/scratch-1/images/attention_maps_without_causal_mask.png differ diff --git a/content/course/submissions/scratch-1/images/loss_curve.png b/content/course/submissions/scratch-1/images/loss_curve.png new file mode 100644 index 00000000..1a1780c4 Binary files /dev/null and b/content/course/submissions/scratch-1/images/loss_curve.png differ diff --git a/content/course/submissions/scratch-1/images/loss_curve_sinusoidal.png b/content/course/submissions/scratch-1/images/loss_curve_sinusoidal.png new file mode 100644 index 00000000..46777ba2 Binary files /dev/null and b/content/course/submissions/scratch-1/images/loss_curve_sinusoidal.png differ diff --git 
a/content/course/submissions/scratch-1/images/loss_curve_without_causal_mask.png b/content/course/submissions/scratch-1/images/loss_curve_without_causal_mask.png new file mode 100644 index 00000000..6eb5dac8 Binary files /dev/null and b/content/course/submissions/scratch-1/images/loss_curve_without_causal_mask.png differ diff --git a/data/trajectories.pkl b/data/trajectories.pkl new file mode 100644 index 00000000..d5b55493 Binary files /dev/null and b/data/trajectories.pkl differ diff --git a/grading_reports/GRADING_REPORT.md b/grading_reports/GRADING_REPORT.md new file mode 100644 index 00000000..ad99928c --- /dev/null +++ b/grading_reports/GRADING_REPORT.md @@ -0,0 +1,65 @@ +![Chris-Bot](~/chris_robot.png) +### 🤖 Chris's Grading Assistant - Feedback Report + +**Student:** @Soorej30 +**PR:** #45 +**Branch:** `scratch-1-Soorej30` + +Hi! I've reviewed your submission. Here's what I found: + +--- + +## 📊 Component Feedback + +### ✅ Causal Self-Attention + +✅ Perfect! Your causal mask correctly prevents future token leakage. + +✅ Test passed. + +### ✅ RMSNorm + +✅ RMSNorm implemented correctly with proper normalization and learnable scale. + +✅ Test passed. + +### ✅ Training Loop + +✅ Excellent! Your model trains successfully and loss converges. + +### ✅ RoPE Embeddings + +✅ RoPE correctly applied to Q and K tensors. + +### ✅ Model Architecture + +✅ Model forward pass works end-to-end with correct output shapes. + +✅ Model has the expected number of trainable parameters. + +### ✅ Code Quality + +Your code imports and runs cleanly. Nice! ✨ + +--- + +## 📝 Documentation & Analysis + +✅ Report submitted! I found: +- `content/course/submissions/scratch-1/Soorej S Nair.mdx` +- `README.md` + +Your instructor will review the quality of your analysis. + +--- + +## 🎯 Mastery Features Detected + +I noticed you implemented: +- RoPE vs Sinusoidal ablation study + +Great work going beyond the requirements! Your instructor will verify implementation quality. 
+ +--- + +> *Grading is automated but reviewed by an instructor. If you have questions, reach out on Slack!* diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..fa8661a8 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,55 @@ +[project] +name = "vla-foundations" +version = "0.1.0" +description = "VLA Foundations Course - Private Instructor Repository" +readme = "README.md" +requires-python = ">=3.10,<3.14" +dependencies = [ + "torch>=2.0.0", + "torchvision", + "numpy>=1.24.0", + "pytest>=7.0.0", + "pytest-html>=4.0.0", + "matplotlib>=3.5.0", + "pandas>=2.0.0", +] + +[[tool.uv.index]] +name = "pytorch-cpu" +url = "https://download.pytorch.org/whl/cpu" +explicit = true + +[[tool.uv.index]] +name = "pytorch-cu118" +url = "https://download.pytorch.org/whl/cu118" +explicit = true + +[tool.uv.sources] +torch = [ + { index = "pytorch-cpu", marker = "sys_platform == 'darwin'" }, + { index = "pytorch-cu118", marker = "sys_platform == 'linux'" } +] +torchvision = [ + { index = "pytorch-cpu", marker = "sys_platform == 'darwin'" }, + { index = "pytorch-cu118", marker = "sys_platform == 'linux'" } +] + +[tool.hatch.build.targets.wheel] +packages = [] + +[tool.pytest.ini_options] +markers = [ + "internal: internal grading tests (never public)", + "rigor: rigorous grading tests", + "gradient: gradient flow tests", + "fidelity: output comparison tests", + "training: training convergence tests", + "mastery: optional mastery-level features (DINOv2, KV-cache, etc.)", +] +testpaths = ["tests"] +python_files = ["test_*.py"] +python_classes = ["Test*"] +python_functions = ["test_*"] + +[dependency-groups] +dev = [] diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 00000000..ea7a96da --- /dev/null +++ b/pytest.ini @@ -0,0 +1,13 @@ +[pytest] +testpaths = tests +python_files = test_*.py +python_classes = Test* +python_functions = test_* +markers = + public: Tests that students can see and run + internal: Internal grading tests (never public) + rigor: 
Rigorous validation tests for grading + gradient: Tests for gradient flow validation + fidelity: Tests for output quality validation + training: Tests for training convergence + mastery: Optional mastery-level features (DINOv2, KV-cache, etc.) diff --git a/scripts/README.md b/scripts/README.md new file mode 100644 index 00000000..0e328843 --- /dev/null +++ b/scripts/README.md @@ -0,0 +1,49 @@ +# CI/CD Scripts + +**Critical infrastructure scripts** used in GitHub Actions workflows. + +## Contents + +### Production Scripts + +- **`manage_solutions.py`** - Inject/reset assignment solutions (used in testing) +- **`sanitize.sh`** - Main sanitization pipeline for public sync +- **`_sanitize_todos.py`** - Remove solution hints from code +- **`audit_linter.py`** - Validate paper audit MDX files + +### Usage in CI/CD + +| Script | Workflow | Purpose | +|--------|----------|---------| +| `audit_linter.py` | `vla-audit.yml` | Validate audit frontmatter | +| `sanitize.sh` | `sync-to-public.yml` | Remove private content | +| `_sanitize_todos.py` | `sync-to-public.yml` | Strip solution hints | +| `manage_solutions.py` | (local testing) | Inject/reset solutions | + +### Critical Requirements + +1. **Fail-Safe**: All scripts must return non-zero exit codes on failure +2. **Idempotent**: Can be run multiple times safely +3. **Validated**: Must pass linting before sync +4. **Documented**: Clear error messages and usage + +## Development Scripts + +Local development helpers are in `scripts/dev/`. These are **not** used in CI/CD. + +## Modification Guidelines + +Changes to scripts in this directory affect production workflows. Always: + +1. Test locally first +2. Verify exit codes +3. Check GitHub Actions logs +4. Update documentation + +## Security + +These scripts handle sensitive operations: +- `sanitize.sh` - Removes private content before public sync +- `manage_solutions.py` - Manages private solutions + +Never commit secrets or tokens to these scripts. 
diff --git a/scripts/audit_linter.py b/scripts/audit_linter.py index 8ecfa1b8..ff1952a6 100755 --- a/scripts/audit_linter.py +++ b/scripts/audit_linter.py @@ -32,6 +32,62 @@ def check_semantic_breaks(file_path): ) return errors +def validate_frontmatter(file_path, content, lines): + """Validate YAML frontmatter contains required fields.""" + errors = [] + + # Extract frontmatter + if not content.startswith('---'): + errors.append( + f"{file_path}: Missing YAML frontmatter. File must start with '---' followed by " + "title, author, paper, and topic fields." + ) + return errors + + # Find the end of frontmatter + frontmatter_end = None + for i, line in enumerate(lines[1:], start=1): + if line.strip() == '---': + frontmatter_end = i + break + + if frontmatter_end is None: + errors.append( + f"{file_path}: Malformed YAML frontmatter. Missing closing '---'." + ) + return errors + + frontmatter_lines = lines[1:frontmatter_end] + frontmatter_text = '\n'.join(frontmatter_lines) + + # Required fields for audit MDX files + required_fields = ['title', 'author', 'topic', 'paper'] + + for field in required_fields: + # Check if field exists (case-insensitive) + if not any(line.strip().lower().startswith(f'{field}:') for line in frontmatter_lines): + errors.append( + f"{file_path}: Missing required frontmatter field: '{field}'" + ) + + # Validate field values are not empty + for line in frontmatter_lines: + stripped = line.strip() + if ':' in stripped: + field_name, field_value = stripped.split(':', 1) + field_name = field_name.strip().lower() + field_value = field_value.strip() + + if field_name in required_fields: + # Check for empty values or placeholder values + if not field_value or field_value in ['""', "''", 'null', 'TBD', 'TODO']: + errors.append( + f"{file_path}: Empty or placeholder value for required field: '{field_name}'" + ) + + return errors + + def check_mdx_syntax(file_path): """Check for MDX-specific syntax issues.""" with open(file_path, 'r', encoding='utf-8') 
as f: @@ -40,12 +96,8 @@ def check_mdx_syntax(file_path): errors = [] - # Check 1: Must have YAML frontmatter at the start - if not content.startswith('---'): - errors.append( - f"{file_path}: Missing YAML frontmatter. File must start with '---' followed by " - "title, author, paper, and topic fields." - ) + # Check 1: Validate frontmatter fields + errors.extend(validate_frontmatter(file_path, content, lines)) # Check 2: No HTML comments (should use JSX-style {/* */}) if '