The rules are simpler than they look. Compare shapes from right to left. At each position, two dimensions are compatible if they are equal or if either of them is 1.

| A shape | B shape | Result shape | Valid? |
|---|---|---|---|
|`(3, 4)`|`(3,)`| Error | ✗ (3 ≠ 4) |
|`(2, 3, 4)`|`(3, 4)`|`(2, 3, 4)`| ✓ |
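The right-to-left rule in the table above can be sketched as a small helper. This is a hypothetical function for illustration, not part of any library API:

```python
def broadcast_shape(a, b):
    """Apply the broadcasting rule: walk both shapes from the right;
    a missing dimension counts as 1; dims must match or one must be 1."""
    result = []
    for i in range(1, max(len(a), len(b)) + 1):
        da = a[-i] if i <= len(a) else 1
        db = b[-i] if i <= len(b) else 1
        if da != db and da != 1 and db != 1:
            raise ValueError(f"incompatible dims {da} vs {db}")
        result.append(max(da, db))
    return tuple(reversed(result))

print(broadcast_shape((2, 3, 4), (3, 4)))  # (2, 3, 4)
try:
    broadcast_shape((3, 4), (3,))
except ValueError as e:
    print(e)  # incompatible dims 4 vs 3
```

Note that the error for `(3, 4)` vs `(3,)` comes from the rightmost position (4 vs 3), exactly as in the table.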
The memory savings are dramatic. Adding a `(768,)` vector to a `(32, 512, 768)` tensor would require copying the vector 32×512 times without broadcasting, allocating {glue:text}`bcast_mb` of redundant data ({glue:text}`bcast_elements` float32 numbers). With broadcasting, you store just the original {glue:text}`bcast_vec_kb` vector.
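The zero-copy behavior is easy to verify with NumPy: `broadcast_to` returns a strided view, so the repeated dimensions occupy no new memory.

```python
import numpy as np

vec = np.zeros(768, dtype=np.float32)        # the original small vector
big = np.broadcast_to(vec, (32, 512, 768))   # no copy: just a strided view

print(vec.nbytes)                  # 3072 bytes (~3 KB)
print(big.strides)                 # (0, 0, 4): repeated dims have stride 0
print(np.shares_memory(vec, big))  # True
```

The stride of 0 in the first two dimensions is the whole trick: every "copy" of the vector points back at the same 3 KB of storage.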
### Views vs. Copies
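A quick NumPy sketch of the distinction: basic slicing yields a view that shares memory with the original, while `.copy()` allocates fresh storage.

```python
import numpy as np

x = np.arange(12).reshape(3, 4)
view = x[:, 1]         # basic slicing returns a view
copy = x[:, 1].copy()  # explicit copy owns its own data

view[0] = 99           # writes through to the original array
print(x[0, 1])                    # 99
print(np.shares_memory(x, view))  # True
print(np.shares_memory(x, copy))  # False
```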
Broadcasting rules, shape semantics, and API design patterns. When you debug PyTorch shape errors, you'll understand exactly what's happening because you implemented the same logic.
### Why Tensors Matter at Scale
```{code-cell} python3
:tags: [remove-input, remove-output]
from myst_nb import glue

# NOTE: cell body reconstructed; values match the prose below.
# LLM parameter storage (fp16 = 2 bytes per param)
llm_bytes = 175e9 * 2
glue("llm_gb", f"{llm_bytes / 1e9:.0f} GB")

# A batch of 128 RGB images (224x224, float32)
batch128_bytes = 128 * 3 * 224 * 224 * 4
glue("batch128_mb", f"{batch128_bytes / 1e6:.0f} MB")
```
To appreciate why tensor operations matter, consider the scale of modern ML systems:
- **Large language models**: 175 billion numbers stored as tensors = **{glue:text}`llm_gb`** (like storing 70,000 full-resolution photos)
- **Image processing**: A batch of 128 images = **{glue:text}`batch128_mb`** of tensor data
- **Self-driving cars**: Process tensor operations at **36 FPS** across multiple cameras (each frame = millions of operations in 28 milliseconds)
A single matrix multiplication can consume **90% of computation time** in neural networks. Understanding tensor operations isn't just academic; it's essential for building and debugging real ML systems.
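A rough cost model makes that claim concrete: multiplying an `(m, k)` matrix by a `(k, n)` matrix takes about `2*m*k*n` floating-point operations, while an element-wise op on the `(m, n)` output takes only `m*n`. The sizes below are assumed for illustration:

```python
# Assumed transformer-ish layer sizes (illustrative only)
m, k, n = 512, 768, 768

matmul_flops = 2 * m * k * n      # multiply-accumulate for each output cell
elementwise_flops = m * n         # one op per output element

print(matmul_flops / elementwise_flops)  # 1536.0: the matmul does ~1500x more work
```

The ratio is just `2*k`, which is why the matmul's share of runtime grows with the inner dimension.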
Test yourself with these systems thinking questions. They're designed to build intuition for the performance characteristics you'll encounter in production ML.
**Q1: Memory Calculation**
A batch of 32 RGB images (224×224 pixels) stored as float32. How much memory?
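One way to check your answer (a worked computation, assuming the standard batch × channels × height × width layout with 4 bytes per float32):

```python
# 32 images x 3 channels x 224 x 224 pixels x 4 bytes (float32)
nbytes = 32 * 3 * 224 * 224 * 4
print(nbytes)                  # 19267584 bytes
print(round(nbytes / 1e6, 1))  # 19.3 (MB)
```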
Let's walk through the key similarities and differences:
Mathematical functions, numerical stability techniques (max subtraction in softmax), and the concept of element-wise transformations. When you debug PyTorch activation issues, you'll understand exactly what's happening because you implemented the same logic.
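The max-subtraction trick mentioned above can be sketched in a few lines. This is a minimal NumPy version for illustration, not the module's actual implementation:

```python
import numpy as np

def softmax(x):
    # Subtracting the row max keeps exp() from overflowing; the result is
    # mathematically identical because the constant cancels in the ratio.
    shifted = x - np.max(x, axis=-1, keepdims=True)
    exp = np.exp(shifted)
    return exp / np.sum(exp, axis=-1, keepdims=True)

# Naive exp(1000) would overflow to inf; the shifted version is finite
print(softmax(np.array([1000.0, 1001.0, 1002.0])))  # sums to 1, no overflow
```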
To appreciate why activation choice matters, consider the scale of modern ML systems:
- **Large language models**: GPT-3 has 96 transformer layers, each with 2 GELU activations. That's **{glue:text}`prose_gelu_ops` GELU operations per forward pass** on billions of parameters.
- **Image classification**: ResNet-50 has 49 convolutional layers, each followed by ReLU. Processing a batch of 256 images at 224×224 resolution means **12 billion ReLU operations** per batch.
- **Production serving**: A model serving 1000 requests per second performs **{glue:text}`prose_daily_activations` activation computations per day**. A 20% speedup from ReLU vs GELU saves hours of compute time.
Activation functions account for **5-15% of total training time** in typical networks (the rest is matrix multiplication). But in transformer models with many layers and small matrix sizes, activations can account for **20-30% of compute time**. This is why GELU vs ReLU is a real trade-off: slower computation but potentially better accuracy.
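A rough micro-benchmark conveys the trade-off. This sketch uses NumPy and the tanh approximation of GELU; absolute timings depend heavily on hardware and array size, so treat the numbers as illustrative:

```python
import time
import numpy as np

x = np.random.randn(1024, 1024).astype(np.float32)

def relu(z):
    return np.maximum(z, 0)

def gelu(z):
    # tanh approximation of GELU
    return 0.5 * z * (1 + np.tanh(np.sqrt(2 / np.pi) * (z + 0.044715 * z**3)))

for fn in (relu, gelu):
    start = time.perf_counter()
    for _ in range(100):
        fn(x)
    print(fn.__name__, time.perf_counter() - start)
```

GELU's extra multiplies and the `tanh` make it several times more expensive per element than a single `maximum`, which is the source of the speed/accuracy trade-off discussed above.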
This is the activation memory for ONE layer. A 100-layer network needs {glue:text}`q1_100layer_mb` just to store activations for one forward pass. This is why activation memory dominates training memory usage — activations must be cached for backpropagation.
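The arithmetic behind that figure, assuming a hypothetical per-layer activation of shape `(128, 1024)` stored as float32:

```python
# Hypothetical per-layer activation: (batch=128, features=1024), float32
per_layer_bytes = 128 * 1024 * 4       # 4 bytes per float32
total_bytes = per_layer_bytes * 100    # cached for all 100 layers

print(per_layer_bytes)        # 524288 (~0.5 MB per layer)
print(total_bytes / 1e6)      # 52.4288 (~50 MB for one forward pass)
```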
**Q2: Computational Cost**
For a standard normal distribution N(0, 1), approximately **50% of values are negative**.
ReLU zeros all negative values, so approximately **50% of outputs will be exactly zero**.
Total elements: 128 × 1024 = {glue:text}`q4_total`

Zeros: {glue:text}`q4_zeros`
This sparsity has major implications:
- **Speed**: Multiplying by zero is free, so downstream computations can skip ~50% of operations
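You can verify the ~50% figure empirically (a NumPy sketch with a fixed seed):

```python
import numpy as np

rng = np.random.default_rng(0)
x = rng.standard_normal((128, 1024)).astype(np.float32)
out = np.maximum(x, 0)       # ReLU: zeros out every negative value

print(out.size)              # 131072 total elements
print((out == 0).mean())     # ~0.5: about half the outputs are exactly zero
```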
Implement Linear layers that combine your Tensor operations with your activation functions.
```{tip} Interactive Options
- **[Launch Binder](https://mybinder.org/v2/gh/harvard-edge/cs249r_book/main?urlpath=lab/tree/tinytorch/modules/02_activations/activations.ipynb)** - Run interactively in browser, no setup required
- **[View Source](https://github.com/harvard-edge/cs249r_book/blob/main/tinytorch/src/02_activations/02_activations.py)** - Browse the implementation code
```