-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathindex.html
More file actions
290 lines (280 loc) · 17 KB
/
index.html
File metadata and controls
290 lines (280 loc) · 17 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8"/>
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<title>Nexus — Expert‑Sliced GPU Scheduling</title>
<meta name="description" content="Nexus: Expert‑Sliced GPU Scheduling for Mixture‑of‑Experts (MoE). Triton routing kernels, CUDA Graphs, dynamic GPU slices, stream parallelism, and energy telemetry."/>
<meta name="color-scheme" content="dark light"/>
<style>
/* Design tokens. These are the default (dark) values; the light palette below overrides the colour tokens. */
:root{
--bg:#0b1020;--bg2:#0e1530;--card:#111a34;--ink:#e8eefc;--muted:#a9b6d9;--line:#2548c9;--pri:#5b77ff;--sec:#36c6ff;--acc:#21d19f;
--shadow:0 10px 30px rgba(0,0,0,.35);
--radius:14px;--radius-sm:10px;--radius-lg:20px;--maxw:1400px;
--mono: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace;
--sans: Inter, ui-sans-serif, system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, "Apple Color Emoji", "Segoe UI Emoji";
}
/* Light palette, applied when the user agent reports a light colour-scheme preference. */
@media(prefers-color-scheme:light){
:root{--bg:#f7f9ff;--bg2:#ecf1ff;--card:#ffffff;--ink:#0e1530;--muted:#4c5a7a;--line:#cbd5ff;--pri:#2746ff;--sec:#0aa4e3;--acc:#15a87f;--shadow:0 10px 24px rgba(41,59,125,.15)}
}
*{box-sizing:border-box}
html,body{margin:0;height:100%;scroll-behavior:smooth}
/* The header is position:sticky at top:0, so without an offset every in-page anchor target
   scrolls underneath it. 84px approximates the header height (24px padding above and below
   one line of nav text, plus the border); adjust if the header layout changes. */
html{scroll-padding-top:84px}
/* Jump straight to anchors for users who ask for reduced motion. */
@media(prefers-reduced-motion:reduce){html,body{scroll-behavior:auto}}
body{font-family:var(--sans);background:radial-gradient(1200px 600px at 20% -10%, rgba(91,119,255,.15), transparent 60%),
radial-gradient(900px 500px at 120% 10%, rgba(33,209,159,.12), transparent 60%), var(--bg);color:var(--ink)}
a{color:var(--sec);text-decoration:none}
a:hover{text-decoration:underline}
/* Layout primitives. */
.wrap{max-width:var(--maxw);margin:0 auto;padding:24px}
header{position:sticky;top:0;backdrop-filter:saturate(140%) blur(8px);background:color-mix(in hsl, var(--bg) 60%, transparent);z-index:50;border-bottom:1px solid color-mix(in hsl, var(--line) 60%, transparent)}
.row{display:flex;gap:16px;align-items:center;justify-content:space-between}
.brand{display:flex;gap:10px;align-items:center;font-weight:800;letter-spacing:.2px}
.brand .dot{width:10px;height:10px;border-radius:999px;background:linear-gradient(135deg,var(--pri),var(--sec))}
nav a{margin:0 10px;font-size:14px;opacity:.9}
/* Hero: two columns that collapse to one in the max-width:980px rule at the end. */
.hero{display:grid;grid-template-columns:1.15fr .85fr;gap:28px;align-items:center;padding:40px 0}
.h1{font-size:38px;line-height:1.15;margin:0 0 8px}
.lead{font-size:16px;color:var(--muted);max-width:60ch}
.cta{display:flex;gap:12px;margin-top:16px}
.btn{display:inline-flex;align-items:center;gap:8px;padding:12px 16px;border-radius:12px;border:1px solid color-mix(in hsl, var(--pri) 50%, var(--line));background:linear-gradient(180deg, color-mix(in hsl, var(--card) 90%, rgba(255,255,255,.08)), var(--card));box-shadow:var(--shadow);font-weight:600}
.btn:hover{transform:translateY(-1px)}
/* Cards and grids. */
.card{background:var(--bg2);border:1px solid color-mix(in hsl, var(--line) 70%, transparent);box-shadow:var(--shadow);border-radius:var(--radius);padding:16px}
.grid{display:grid;grid-template-columns:repeat(3,1fr);gap:16px}
.kdb{font-family:var(--mono);font-size:13px;padding:6px 8px;border-radius:8px;background:color-mix(in hsl, var(--bg2) 90%, #000);border:1px solid color-mix(in hsl, var(--line) 60%, transparent)}
/* Typography and code samples. */
h2{margin:34px 0 14px;font-size:24px}
h3{margin:24px 0 10px;font-size:18px}
pre,code{font-family:var(--mono);font-size:13px}
pre{background:color-mix(in hsl, var(--bg) 85%, #000);padding:14px;border-radius:12px;border:1px solid color-mix(in hsl, var(--line) 60%, transparent);overflow:auto}
.two{display:grid;grid-template-columns:0.8fr 1.2fr;gap:18px}
.arch{background:var(--bg2);border-radius:var(--radius);padding:8px;border:1px solid color-mix(in hsl, var(--line) 70%, transparent)}
footer{opacity:.75;padding:28px 0 40px;font-size:13px}
.pill{display:inline-flex;align-items:center;gap:6px;padding:6px 10px;border-radius:999px;border:1px solid color-mix(in hsl, var(--line) 70%, transparent);background:color-mix(in hsl, var(--bg2) 92%, #000);font-size:12px}
.badges{display:flex;flex-wrap:wrap;gap:8px;margin:12px 0}
.note{font-size:13px;color:var(--muted)}
/* Narrow viewports: every multi-column grid becomes a single column. */
@media(max-width:980px){.hero{grid-template-columns:1fr}.grid{grid-template-columns:1fr}.two{grid-template-columns:1fr}}
</style>
</head>
<body>
<header>
  <div class="wrap row">
    <div class="brand"><span class="dot"></span><span>Nexus</span></div>
    <!-- <nav> is already a navigation landmark, so it carries the label itself;
         putting role="navigation" on the wrapper as well exposed two nested landmarks. -->
    <nav aria-label="Primary">
      <a href="#features">Features</a>
      <a href="#architecture">Architecture</a>
      <a href="#quickstart">Quickstart</a>
      <!-- No "Benchmarks" link for now: the section with id="bench" is commented out
           further down the page, so the anchor had no target. Restore the link when
           that section is published. -->
      <a href="#cite">Cite</a>
      <a href="https://github.com/Esmail-ibraheem/Nexus" target="_blank" rel="noreferrer">GitHub ↗</a>
    </nav>
  </div>
</header>
<main class="wrap">
<section class="hero" aria-label="Intro">
  <!-- First grid cell: headline, pitch, capability badges and calls to action. -->
  <div>
    <h1 class="h1">Expert‑Sliced GPU Scheduling for MoE</h1>
    <p class="lead">Nexus orchestrates Mixture‑of‑Experts execution with <b>Triton routing kernels</b>, <b>CUDA Graph</b> replay, and <b>dynamic GPU slices</b> mapped across CUDA streams — with built‑in energy telemetry.</p>
    <div class="badges">
      <span class="pill">CUDA 12.x</span>
      <span class="pill">Triton 2.x</span>
      <span class="pill">A100/H100 Ready</span>
      <span class="pill">Streams: 8 (configurable)</span>
    </div>
    <div class="cta">
      <a class="btn" href="https://github.com/Esmail-ibraheem/Nexus" target="_blank" rel="noreferrer">★ Star on GitHub</a>
      <a class="btn" href="#quickstart">Get Started</a>
    </div>
  </div>
  <!-- Second grid cell: install snippets. Text inside <pre> stays flush-left because
       its whitespace is rendered verbatim. -->
  <div class="card">
    <div class="note">pip one‑liner</div>
    <pre><code>pip install nexus
python -m nexus.demo </code></pre>
    <div class="note">Or build from source:</div>
    <pre><code>git clone https://github.com/Esmail-ibraheem/Nexus
cd Nexus
pip install -e .</code></pre>
  </div>
</section>
<section id="features" aria-label="Features">
  <h2>What Nexus Provides</h2>
  <!-- One card per component; .grid lays them out three across and one across on narrow screens. -->
  <div class="grid">
    <div class="card">
      <h3>Triton Routing Kernel</h3>
      <p>Fuses softmax → top‑k → atomic expert counts in a single pass to reduce memory traffic and create expert token buckets.</p>
    </div>
    <div class="card">
      <h3>Expert Profiler</h3>
      <p>Tracks per‑expert usage with a rolling window to distinguish hot/warm/cold experts for better placement.</p>
    </div>
    <div class="card">
      <h3>GPU Slice Manager</h3>
      <p>Dynamically assigns SM/BW/cache budget slices; integrates with MIG partitions when available.</p>
    </div>
    <div class="card">
      <h3>CUDA Graph Manager</h3>
      <p>Captures warm‑started execution patterns and replays them to minimize kernel launch overhead.</p>
    </div>
    <div class="card">
      <h3>Stream Scheduler</h3>
      <p>Maps experts to CUDA streams (N configurable) for parallel execution across slices.</p>
    </div>
    <div class="card">
      <h3>Energy Monitor</h3>
      <p>NVML‑based power telemetry; reports tokens/J and feeds the profiler for energy‑aware scheduling.</p>
    </div>
  </div>
</section>
<section id="architecture" class="two" aria-label="Architecture">
<!-- Text column of the architecture section: heading, one-line summary and the three planes. -->
<div>
  <h2>Architecture Overview</h2>
  <p class="note">Data‑flow vs control‑flow are separated; slices can be resized or migrated at runtime.</p>
  <ul>
    <li>Data plane: Token ingress → <b>Triton Routing</b> → Expert kernels → Weighted aggregation → MoE output</li>
    <li>Control plane: <b>Expert Profiler</b>, <b>GPU Slice Manager</b>, <b>CUDA Graph Manager</b>, <b>Stream Scheduler</b></li>
    <li>Execution plane: Dynamic GPU slices (Hot/Warm/Cold, Aux, Router‑Backward, optional MIG)</li>
  </ul>
</div>
<div class="arch" aria-label="Architecture diagram">
<!-- Inline SVG (same as we exported) -->
<svg viewBox="0 0 1200 820" role="img" aria-label="Architecture diagram for Nexus" style="width:100%;height:auto;display:block;border-radius:12px;max-width:100%;min-height:720px">
<!-- Arrowhead markers: "arrowhead" is referenced by .arrow, "arrowhead2" by .arrow2. -->
<defs>
  <marker id="arrowhead" markerWidth="8" markerHeight="6" refX="7" refY="3" orient="auto" markerUnits="strokeWidth">
    <path d="M0,0 L8,3 L0,6 Z" fill="#9dd0ff"/>
  </marker>
  <marker id="arrowhead2" markerWidth="8" markerHeight="6" refX="7" refY="3" orient="auto" markerUnits="strokeWidth">
    <path d="M0,0 L8,3 L0,6 Z" fill="#21d19f"/>
  </marker>
</defs>
<style>
  /* Box variants; the legend group in this diagram explains what each colour stands for. */
  .box{fill:#131c3f;stroke:#5b77ff;stroke-width:1.2;rx:10;ry:10}
  .box2{fill:#111a34;stroke:#36c6ff;stroke-width:1.1;rx:10;ry:10}
  .box3{fill:#131d2c;stroke:#21d19f;stroke-width:1.1;rx:10;ry:10}
  /* Dashed outline around the whole GPU area, and the plain slice rectangles inside it. */
  .gpu{fill:#0f1a2e;stroke:#9aa7ff;stroke-dasharray:6 4;rx:14;ry:14}
  .slice{fill:#16234b;stroke:#9aa7ff;stroke-width:1}
  /* Text sizes: bold titles (12px), body lines (11px), captions (10px). */
  .titleText{fill:#eaf0ff;font-weight:700;font-size:12px;font-family:var(--sans)}
  .t{fill:#cfe2ff;font-size:11px;font-family:var(--sans)}
  .small{fill:#cfe2ff;font-size:10px;font-family:var(--sans)}
  /* Connector lines, each ending in its matching marker. */
  .arrow{stroke:#9dd0ff;stroke-width:1.6;marker-end:url(#arrowhead)}
  .arrow2{stroke:#21d19f;stroke-width:1.6;marker-end:url(#arrowhead2)}
</style>
<!-- Token ingress box. The ampersand is written as &amp; so the drawing stays well-formed
     if it is copied out into a standalone .svg (XML) file; the rendered text is unchanged. -->
<rect class="box" x="30" y="60" width="250" height="120"/>
<text class="titleText" x="50" y="85">Token Ingress &amp; Gating</text>
<text class="t" x="50" y="110">• Inputs (tokens, shapes)</text>
<text class="t" x="50" y="130">• Router logits</text>
<text class="small" x="50" y="150">(k‑of‑N top‑k selection)</text>
<!-- Routing kernel box, directly below the ingress box. -->
<rect class="box2" x="30" y="210" width="250" height="150"/>
<text class="titleText" x="50" y="235">Triton Routing Kernel</text>
<text class="t" x="50" y="260">softmax ⟶ top‑k ⟶ atomic expert counts</text>
<text class="small" x="50" y="280">single‑pass, no intermed. tensors</text>
<text class="small" x="50" y="300">3× lower memory traffic</text>
<!-- Arrow from the routing box's right edge (x=280) to the GPU outline's left edge (x=380). -->
<line class="arrow" x1="280" y1="285" x2="380" y2="285"/>
<!-- Dashed outline that encloses everything drawn for the GPU. -->
<rect class="gpu" x="380" y="40" width="790" height="740"/>
<text class="titleText" x="400" y="65">A100 / H100 GPU Complex</text>
<text class="small" x="400" y="83">HBM, SMs, L2; MIG optional</text>
<!-- Top row: profiler, slice manager, graph manager. -->
<rect class="box2" x="410" y="110" width="250" height="110"/>
<text class="titleText" x="430" y="135">Expert Profiler</text>
<text class="t" x="430" y="160">runtime usage (ui tokens/s)</text>
<text class="small" x="430" y="178">rolling window; hot/cold experts</text>
<rect class="box2" x="680" y="110" width="250" height="110"/>
<text class="titleText" x="700" y="135">GPU Slice Manager</text>
<text class="t" x="700" y="160">dynamic/proportional allocation</text>
<!-- &amp; rather than a bare ampersand keeps the markup valid as XML; it renders as "&". -->
<text class="small" x="700" y="178">evict/assign SM &amp; BW slices</text>
<rect class="box2" x="950" y="110" width="200" height="110"/>
<text class="titleText" x="970" y="135">CUDA Graph Manager</text>
<text class="small" x="970" y="160">warm‑up ⟶ capture ⟶ replay</text>
<text class="small" x="970" y="178">minimize launch overhead</text>
<!-- Short arrows across the gaps between the three top-row boxes. -->
<line class="arrow2" x1="660" y1="165" x2="680" y2="165"/>
<line class="arrow2" x1="930" y1="165" x2="950" y2="165"/>
<!-- Stream scheduler, with one arrow arriving from each top-row box. -->
<rect class="box3" x="410" y="250" width="300" height="90"/>
<text class="titleText" x="430" y="275">Stream Scheduler</text>
<text class="t" x="430" y="298">assign experts ⟶ CUDA streams (N=8)</text>
<line class="arrow2" x1="535" y1="220" x2="535" y2="250"/>
<line class="arrow2" x1="805" y1="220" x2="705" y2="250"/>
<line class="arrow2" x1="1050" y1="220" x2="710" y2="250"/>
<!-- Container box for the six slice tiles. -->
<rect class="box" x="410" y="360" width="740" height="340"/>
<text class="titleText" x="430" y="385">GPU Execution Plane — Dynamic Slices</text>
<text class="small" x="430" y="403">SM groups, BW, cache budget; MIG partitions when available</text>
<!-- Upper row of slice tiles (y=420): A, B and C, each with an inner kernel box. -->
<g>
  <rect class="slice" x="430" y="420" width="220" height="120"/>
  <text class="t" x="440" y="445">Slice A — Hot Experts</text>
  <text class="small" x="440" y="465">larger SM share</text>
  <rect class="box2" x="445" y="475" width="190" height="50"/>
  <text class="small" x="455" y="505">Fused Expert MLP (Triton)</text>
</g>
<g>
  <rect class="slice" x="670" y="420" width="220" height="120"/>
  <text class="t" x="680" y="445">Slice B — Warm Experts</text>
  <text class="small" x="680" y="465">medium SM share</text>
  <rect class="box2" x="685" y="475" width="190" height="50"/>
  <text class="small" x="695" y="505">Batched Expert Kernel</text>
</g>
<g>
  <rect class="slice" x="910" y="420" width="220" height="120"/>
  <text class="t" x="920" y="445">Slice C — Cold Experts</text>
  <text class="small" x="920" y="465">share SMs</text>
  <rect class="box2" x="925" y="475" width="190" height="50"/>
  <text class="small" x="935" y="505">Time‑sliced / queued</text>
</g>
<!-- Lower row of slice tiles (y=560): D, E and F, label text only. -->
<g>
  <rect class="slice" x="430" y="560" width="220" height="120"/>
  <text class="t" x="440" y="585">Slice D — Aux</text>
  <text class="small" x="440" y="605">aggregation / residuals</text>
</g>
<g>
  <rect class="slice" x="670" y="560" width="220" height="120"/>
  <text class="t" x="680" y="585">Slice E — Router Backward</text>
  <text class="small" x="680" y="605">(training)</text>
</g>
<g>
  <rect class="slice" x="910" y="560" width="220" height="120"/>
  <text class="t" x="920" y="585">Slice F — MIG Instance</text>
  <text class="small" x="920" y="605">optional A100/H100</text>
</g>
<!-- Three arrows fanning out from one point on the scheduler's bottom edge (610,340) to the upper slice row. -->
<line class="arrow" x1="610" y1="340" x2="540" y2="420"/>
<line class="arrow" x1="610" y1="340" x2="780" y2="420"/>
<line class="arrow" x1="610" y1="340" x2="1010" y2="420"/>
<!-- Bottom strip: aggregation box, an arrow, then the layer-output box. -->
<rect class="box" x="410" y="720" width="300" height="40"/>
<text class="t" x="420" y="745">Expert Outputs Aggregation (weighted sum / concat)</text>
<line class="arrow" x1="710" y1="740" x2="840" y2="740"/>
<rect class="box" x="840" y="720" width="310" height="40"/>
<text class="t" x="850" y="745">MoE Layer Output ⟶ next layer / loss</text>
<!-- Energy monitor in the left column, with two arrows leaving its right edge (x=280). -->
<rect class="box3" x="30" y="400" width="250" height="120"/>
<text class="titleText" x="50" y="425">Energy Monitor</text>
<text class="small" x="50" y="445">NVML power · tokens/J</text>
<text class="small" x="50" y="463">emits telemetry ⟶ profiler</text>
<line class="arrow2" x1="280" y1="460" x2="410" y2="460"/>
<line class="arrow2" x1="280" y1="440" x2="410" y2="180"/>
<!-- Vertical arrow from the ingress box (bottom at y=180) to the routing box (top at y=210). -->
<line class="arrow" x1="155" y1="180" x2="155" y2="210"/>
<!-- Legend: three colour swatches followed by one sample of each arrow style. -->
<g>
  <rect x="30" y="560" width="250" height="140" fill="#1b2a6b" stroke="#3b4ea0" rx="12" ry="12"/>
  <text class="small" x="45" y="585" style="fill:#fff;opacity:.9">Legend</text>
  <rect class="box" x="45" y="600" width="16" height="10"/>
  <text class="small" x="70" y="609" style="fill:#fff;opacity:.9">Data plane / tensors</text>
  <rect class="box2" x="45" y="620" width="16" height="10"/>
  <text class="small" x="70" y="629" style="fill:#fff;opacity:.9">Control kernels / managers</text>
  <rect class="box3" x="45" y="640" width="16" height="10"/>
  <text class="small" x="70" y="649" style="fill:#fff;opacity:.9">Schedulers / telemetry</text>
  <!-- <line> is positioned with y1/y2; a bare "y" attribute is ignored, which left both
       endpoints at the default 0 and drew the sample arrows along the top edge of the
       canvas instead of next to their labels. -->
  <line class="arrow" x1="48" y1="665" x2="62" y2="665"/>
  <text class="small" x="70" y="669" style="fill:#fff;opacity:.9">data‑flow</text>
  <line class="arrow2" x1="48" y1="685" x2="62" y2="685"/>
  <text class="small" x="70" y="689" style="fill:#fff;opacity:.9">control‑flow / feedback</text>
</g>
</svg>
</div>
</section>
<section id="quickstart" aria-label="Quickstart">
  <h2>Quickstart</h2>
  <!-- Two cards side by side. Text inside <pre> is kept flush-left because its
       whitespace is rendered verbatim. -->
  <div class="two">
    <div class="card">
      <h3>Install (Dev)</h3>
      <pre><code>git clone https://github.com/Esmail-ibraheem/Nexus
cd Nexus
pip install -e .
python -m nexus.demo</code></pre>
    </div>
    <div class="card">
      <h3>Minimal Usage (Pseudo)</h3>
      <pre><code>from nexus import Router, SliceManager, schedule
router = Router(top_k=2)
buckets = router.route(tokens, logits)
plan = SliceManager().allocate(buckets)
schedule(plan).run()</code></pre>
    </div>
  </div>
</section>
<!-- <section id="bench" aria-label="Benchmarks">
<h2>Benchmarks (WIP)</h2>
<div class="card"><p>Plug in your latest numbers (tokens/s, latency, tokens/J). Add charts or a table right here.</p></div>
</section> -->
<section id="cite" aria-label="Cite">
  <h2>Citation</h2>
  <!-- BibTeX entry; the <pre> text is kept flush-left because its whitespace is rendered verbatim. -->
  <pre><code>@article{nexus2025,
title = {Nexus: Expert-Sliced GPU Scheduling for Mixture-of-Experts},
author = {Gumaan, Esmail},
journal = {arXiv preprint arXiv:xxxx.xxxxx},
year = {2025}
}</code></pre>
</section>
<!-- Page footer: copyright line and repository link, spread apart by the flex .row. -->
<footer>
  <div class="row">
    <span>© 2025 Nexus • Expert‑Sliced GPU Scheduling</span>
    <span><a href="https://github.com/Esmail-ibraheem/Nexus" target="_blank" rel="noreferrer">GitHub</a></span>
  </div>
</footer>
</main>
</body>
</html>