5 changes: 4 additions & 1 deletion deepseek/README.md
@@ -103,4 +103,7 @@ The above plots try to answer this question -> `How much attention, on average,
- Expert 0–1 → grammar & attribution (“and”, “said”)
- Expert 2–3 → event & interaction semantics (“play”, “with”, “time”, “She”)
- NOUN peaks in Expert 2 (0.18) → this expert handles object and subject nouns (like “Lily”, “time”, “girl”).
- PUNCT high in Expert 1 → punctuation + quotation boundaries → this expert probably activates for dialogue and sentence ends.

### Model Architecture diagram
![model_architecture](images/deepseek_arch.png)
Copilot AI Oct 8, 2025
The README references 'deepseek_arch.png' but the added file is 'deepseek_arch.mermaid'. Either the image reference should point to the correct file extension or a PNG version needs to be generated from the Mermaid diagram.

Suggested change
![model_architecture](images/deepseek_arch.png)
```mermaid
%% See deepseek_arch.mermaid for the source diagram
%% Paste the contents of deepseek_arch.mermaid below
[PASTE THE CONTENTS OF deepseek_arch.mermaid HERE]
```
41 changes: 41 additions & 0 deletions deepseek/images/deepseek_arch.mermaid
@@ -0,0 +1,41 @@
flowchart TB
    %% ====== Top-Level Model ======
    subgraph Model["DeepseekInspiredModel"]
        direction TB
        X["Token IDs"]
        Embed["Embedding Layer"]
        H0["Hidden States (h₀)"]

        %% --- Dense Blocks ---
        subgraph Dense["Dense Blocks (count = num_dense_ffn = 2)"]
            direction TB
            DB["TransformerBlock\n→ Attention + Dense Expert FFN"]
        end

        %% --- MoE Blocks ---
        subgraph MoE["MoE Blocks (count = num_moe_ffn = 4)"]
            direction TB
            MB["TransformerBlock\n→ Attention + MoE FFN"]
        end

        LN["Final RMSNorm"]
        Head["Linear to vocab (tied weights)"]
        Logits["Output Logits"]

        X --> Embed --> H0 --> Dense --> MoE --> LN --> Head --> Logits
    end

    %% ====== MoE FFN Internals (high-level) ======
    subgraph MoE_FFN["MoE FFN per token"]
        direction TB
        h["Input h"]
        Gate["Gate: Linear → scores\n(top-k = 2)"]
        Routed["Routed Experts (num = 16)\nOnly top-2 active per token"]
        Shared["Shared Experts (num = 8)\nAlways active"]
        Combine["Sum( Routed_output + Shared_output )"]

        h --> Gate --> Routed --> Combine
        h --> Shared --> Combine
    end

    MB --- MoE_FFN
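
For readers who prefer code to diagrams, here is a minimal PyTorch sketch of the MoE FFN routing shown above: a linear gate picks the top-2 of 16 routed experts per token, all 8 shared experts always run, and the two paths are summed. The names and sizes here (`MoEFFN`, `Expert`, the SwiGLU expert body, `dim=512`, `hidden=1024`) are illustrative assumptions, not the repo's actual implementation.

```python
# Sketch of the "MoE FFN per token" subgraph: gated top-2 routing plus
# always-active shared experts. All class/parameter names are assumptions.
import torch
import torch.nn as nn
import torch.nn.functional as F

class Expert(nn.Module):
    """One FFN expert; a SwiGLU body is assumed here."""
    def __init__(self, dim, hidden):
        super().__init__()
        self.gate = nn.Linear(dim, hidden, bias=False)
        self.up = nn.Linear(dim, hidden, bias=False)
        self.down = nn.Linear(hidden, dim, bias=False)

    def forward(self, x):
        return self.down(F.silu(self.gate(x)) * self.up(x))

class MoEFFN(nn.Module):
    """Gate → top-2 of 16 routed experts, plus 8 always-active shared experts."""
    def __init__(self, dim=512, hidden=1024, num_routed=16, num_shared=8, top_k=2):
        super().__init__()
        self.top_k = top_k
        self.router = nn.Linear(dim, num_routed, bias=False)  # "Gate: Linear → scores"
        self.routed = nn.ModuleList(Expert(dim, hidden) for _ in range(num_routed))
        self.shared = nn.ModuleList(Expert(dim, hidden) for _ in range(num_shared))

    def forward(self, h):                                     # h: (num_tokens, dim)
        scores = F.softmax(self.router(h), dim=-1)
        weights, idx = scores.topk(self.top_k, dim=-1)        # top-2 experts per token
        out = sum(e(h) for e in self.shared)                  # shared path: always active
        for k in range(self.top_k):                           # routed path: top-2 only
            for e_id in idx[:, k].unique():
                mask = idx[:, k] == e_id                      # tokens routed to expert e_id
                w = weights[mask, k].unsqueeze(-1)
                out[mask] += w * self.routed[int(e_id)](h[mask])
        return out                                            # Sum(Routed_output + Shared_output)

# Usage: route four random token vectors
moe = MoEFFN()
print(moe(torch.randn(4, 512)).shape)  # torch.Size([4, 512])
```

The shared experts give every token a dense, always-on transformation, so the gate only has to allocate the specialized capacity of the 16 routed experts.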
Binary file added deepseek/images/deepseek_arch.png