1 change: 1 addition & 0 deletions .gitignore
@@ -48,6 +48,7 @@ next-env.d.ts

# python
/venv/
.venv/
*.pyc
__pycache__/

73 changes: 73 additions & 0 deletions content/course/submissions/scratch-1/yuniwu.mdx
@@ -0,0 +1,73 @@
---
title: "Scratch-1 Submission: Yuni Wu"
student: "Yuni Wu"
date: "2026-02-03"
---

# Scratch-1: The Transformer Backbone

This assignment implements a decoder-only Transformer from scratch and trains it on a synthetic robotic trajectory dataset for next-token prediction. The experiment demonstrates that a single autoregressive architecture can **learn structured robotic action sequences** when the data is designed to be learnable.

---

## Loss Curve

![Training Loss (Audit: Causal vs No Mask)](./images/audit_loss_curve.png)

The training loss drops rapidly during the first few hundred steps, then converges gradually, stabilizing at a low value after several thousand iterations.

This behavior indicates that:
- The action encoding is learnable from state information.
- The Transformer successfully captures temporal dependencies in the trajectory.
- Optimization is stable with RMSNorm, RoPE, and gradient clipping enabled.
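
The stability measures above can be condensed into a single training step. The sketch below is illustrative, not the submission's actual code: `model`, `optimizer`, `loss_fn`, and `batch` are placeholder names, and `max_norm=1.0` is an assumed clipping threshold.

```python
import torch

def train_step(model, optimizer, loss_fn, batch, max_norm=1.0):
    """One optimization step with gradient clipping (illustrative sketch)."""
    optimizer.zero_grad()
    inputs, targets = batch
    logits = model(inputs)
    # Flatten (batch, ..., vocab) logits and targets for cross-entropy.
    loss = loss_fn(logits.view(-1, logits.size(-1)), targets.view(-1))
    loss.backward()
    # Rescale all gradients so their global L2 norm is at most max_norm,
    # preventing occasional large updates from destabilizing training.
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
    optimizer.step()
    return loss.item()
```

Clipping by global norm (rather than per-element) preserves the direction of the update while bounding its size, which pairs well with pre-norm architectures.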

---

## Attention Visualization

![Attention Maps - Layer 0 Head 0](./images/attention_layer0_head0.png)
![Attention Maps - Last Layer Head 0](./images/attention_last_layer_head0.png)
![Attention Maps - Layer 0 Avg. Heads](./images/attention_layer0_avg_heads.png)
![Attention Maps - Last Layer Avg. Heads](./images/attention_last_layer_avg_heads.png)


We visualize attention maps from both the **first Transformer layer (Layer 0)** and the **last Transformer layer**.

- **Layer 0** primarily attends to nearby timesteps, showing strong diagonal patterns. This suggests the model is learning local, short-horizon dependencies.
- **Last Layer** exhibits broader attention over longer temporal ranges, indicating aggregation of global trajectory-level information relevant for action prediction.

This layer-wise difference reflects hierarchical representation learning, consistent with behavior observed in large language models.
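
Once attention weights are stored during the forward pass, rendering maps like these is straightforward. A minimal sketch using matplotlib (the function name, shapes, and styling here are assumptions, not the submission's actual plotting code):

```python
import numpy as np
import matplotlib
matplotlib.use("Agg")  # headless backend for saving figures
import matplotlib.pyplot as plt

def plot_attention(attn, title, path):
    """Save a head-averaged attention heatmap.

    attn: (heads, seq_len, seq_len) array of softmax attention weights.
    """
    avg = attn.mean(axis=0)  # average over heads
    fig, ax = plt.subplots(figsize=(4, 4))
    im = ax.imshow(avg, cmap="viridis", aspect="auto")
    ax.set_xlabel("key position")
    ax.set_ylabel("query position")
    ax.set_title(title)
    fig.colorbar(im, ax=ax)
    fig.savefig(path, bbox_inches="tight")
    plt.close(fig)
```

A strong diagonal in such a heatmap corresponds to local attention; mass spread across each row indicates longer-range aggregation.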

---

## The Audit: Removing the Causal Mask

When the causal mask is removed, the training loss drops significantly faster and converges to a much lower value. However, this improvement is artificial.

---

### Why the Model "Cheats"

Without the causal mask, each token can attend to future ground-truth actions through self-attention. This causes information leakage: the model no longer learns to predict the next action from past context, but instead copies future tokens directly. As a result, the loss no longer reflects genuine predictive ability.
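
The mechanism can be seen directly in the attention weights. A minimal sketch of scaled dot-product attention with an optional causal mask (illustrative names and shapes, assuming a standard `(batch, heads, seq_len, head_dim)` layout; not the submission's actual code):

```python
import torch
import torch.nn.functional as F

def masked_attention(q, k, v, causal=True):
    """Scaled dot-product attention; q, k, v: (batch, heads, seq, head_dim)."""
    scores = q @ k.transpose(-2, -1) / (q.size(-1) ** 0.5)
    if causal:
        seq_len = q.size(-2)
        mask = torch.tril(torch.ones(seq_len, seq_len,
                                     dtype=torch.bool, device=q.device))
        # Future positions (above the diagonal) get -inf *before* softmax,
        # so they receive exactly zero attention weight.
        scores = scores.masked_fill(~mask, float("-inf"))
    weights = F.softmax(scores, dim=-1)
    return weights @ v, weights
```

With `causal=False`, every row of `weights` can place mass on later positions, which is precisely the leakage path: the target of step *t* is visible at step *t* through attention.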

---

## Code Highlights

- **Causal Self-Attention** implemented manually with a lower-triangular mask applied *before* softmax.
- **Rotary Positional Embeddings (RoPE)** used instead of absolute positional embeddings to encode relative temporal structure.
- **RMSNorm** applied in a pre-norm configuration for improved training stability.
- Attention maps are explicitly stored during forward passes for visualization and analysis.
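
For reference, RMSNorm as used in a pre-norm block can be sketched as follows (an illustrative implementation of the standard formulation, not the submission's actual code; `eps=1e-6` is an assumed default):

```python
import torch
import torch.nn as nn

class RMSNorm(nn.Module):
    """Root-mean-square normalization: rescales each feature vector by its
    RMS and applies a learnable per-feature gain. Unlike LayerNorm, there
    is no mean subtraction and no bias term."""

    def __init__(self, dim, eps=1e-6):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def forward(self, x):
        # rsqrt of the mean square over the feature dimension.
        inv_rms = torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
        return x * inv_rms * self.weight
```

In a pre-norm configuration this is applied to the block *input* (before attention and before the MLP), which keeps the residual stream unnormalized and tends to stabilize deep-network training.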

---

## Challenges and Solutions

- **Training instability** was mitigated using gradient clipping and RMSNorm.
- **Unlearnable action noise** was resolved by redesigning the action space into a structured 256-bin encoding (direction + magnitude).
- **Attention visualization bugs** were fixed by properly handling step-based logs and isolating the final training run.
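
One way a 256-bin direction + magnitude encoding could factor is 32 direction bins times 8 magnitude bins. The sketch below is hypothetical: the bin counts, the `MAX_MAG` cap, and the token layout are assumptions for illustration, and the submission's actual scheme may differ.

```python
import numpy as np

N_DIR, N_MAG = 32, 8   # hypothetical: 32 directions x 8 magnitudes = 256 tokens
MAX_MAG = 1.0          # assumed maximum action magnitude before clipping

def encode_action(dx, dy):
    """Map a 2-D action to a token id in [0, 255] (hypothetical scheme)."""
    angle = np.arctan2(dy, dx) % (2 * np.pi)          # direction in [0, 2*pi)
    d = int(angle / (2 * np.pi) * N_DIR) % N_DIR      # direction bin
    mag = min(np.hypot(dx, dy), MAX_MAG)              # clipped magnitude
    m = min(int(mag / MAX_MAG * N_MAG), N_MAG - 1)    # magnitude bin
    return d * N_MAG + m
```

The point of such a factored encoding is that nearby actions map to nearby (or identical) tokens, giving the model a learnable structure instead of unstructured continuous noise.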

---

Overall, this assignment demonstrates that with proper architectural constraints and data design, a decoder-only Transformer can effectively model robotic control as a unified sequence modeling problem.
60 changes: 60 additions & 0 deletions grading_reports/GRADING_REPORT.md
@@ -0,0 +1,60 @@
![Chris-Bot](~/chris_robot.png)
### 🤖 Chris's Grading Assistant - Feedback Report

**Student:** @yuni-wyx
**PR:** #42
**Branch:** `scratch-1-yuniwu`

Hi! I've reviewed your submission. Here's what I found:

---

## 📊 Component Feedback

### ✅ Causal Self-Attention

✅ Perfect! Your causal mask correctly prevents future token leakage.

✅ Test passed.

### ✅ RMSNorm

✅ RMSNorm implemented correctly with proper normalization and learnable scale.

✅ Test passed.

### ✅ Training Loop

✅ Excellent! Your model trains successfully and loss converges.

### ✅ RoPE Embeddings

✅ RoPE correctly applied to Q and K tensors.

### ✅ Model Architecture

✅ Model forward pass works end-to-end with correct output shapes.

✅ Model has the expected number of trainable parameters.

### ❌ Code Quality

✅ Code imports successfully.

✅ Test passed.

❌ Test failed.

---

## 📝 Documentation & Analysis

✅ Report submitted! I found:
- `content/course/submissions/scratch-1/yuniwu.mdx`
- `README.md`

Your instructor will review the quality of your analysis.

---

> *Grading is automated but reviewed by an instructor. If you have questions, reach out on Slack!*
16 changes: 14 additions & 2 deletions pyproject.toml
@@ -10,16 +10,28 @@ dependencies = [
"numpy>=1.24.0",
"pytest>=7.0.0",
"pytest-html>=4.0.0",
"matplotlib>=3.5.0",
]

[[tool.uv.index]]
name = "pytorch-cpu"
url = "https://download.pytorch.org/whl/cpu"
explicit = true

[[tool.uv.index]]
name = "pytorch-cu118"
url = "https://download.pytorch.org/whl/cu118"
explicit = true

[tool.uv.sources]
torch = [{ index = "pytorch-cu118" }]
torchvision = [{ index = "pytorch-cu118" }]
torch = [
{ index = "pytorch-cpu", marker = "sys_platform == 'darwin'" },
{ index = "pytorch-cu118", marker = "sys_platform == 'linux'" }
]
torchvision = [
{ index = "pytorch-cpu", marker = "sys_platform == 'darwin'" },
{ index = "pytorch-cu118", marker = "sys_platform == 'linux'" }
]

[tool.hatch.build.targets.wheel]
packages = []