Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions lib/iris/examples/marin.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ scale_groups:
num_vms: 1
priority: 10
resources: { cpu: 112, ram: 192GB, disk: 100GB, tpu_count: 4, gpu_count: 0 }
min_slices: 0
min_slices: 1
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Priority: High

Setting min_slices: 1 here, and again for tpu_v6e_4, turns these zoned groups into five permanently warm 4-chip TPU slices, not one global warm pool. load_config() preserves explicit min_slices on every zone-expanded group, and the autoscaler enforces that floor even with zero demand, so this change stops these pools from scaling back to zero. If the goal is to reduce cold-start latency, this needs a single dedicated warm pool or some other global mechanism rather than min_slices on multi-zone groups.

Recommended fix: revert these min_slices changes, or replace them with a design that keeps at most one intentional warm slice instead of one per zone-expanded group.

Generated with Codex.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, this is okay and intentional; we want to keep these slices warm.

max_slices: 1024
slice_template:
accelerator_variant: v5litepod-4
Expand Down Expand Up @@ -183,7 +183,7 @@ scale_groups:
num_vms: 1
priority: 10
resources: { cpu: 180, ram: 720GB, disk: 100GB, tpu_count: 4, gpu_count: 0 }
min_slices: 0
min_slices: 1
max_slices: 1024
slice_template:
accelerator_variant: v6e-4
Expand Down
120 changes: 65 additions & 55 deletions lib/iris/src/iris/cluster/static/controller/worker-detail.js
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ import { useState, useEffect, useRef, useCallback } from 'preact/hooks';
import htm from 'htm';
import { controllerRpc } from '/static/shared/rpc.js';
import { formatBytes, formatRelativeTime, stateToName, formatDuration } from '/static/shared/utils.js';
import { MetricCard, ResourceSection, Gauge, InlineGauge, Field, Section, Sparkline, formatMbPair, formatRate } from '/static/shared/components.js';
import { MetricCard, InlineGauge, Field, Section, Sparkline, formatMbPair, formatRate } from '/static/shared/components.js';

const html = htm.bind(h);

Expand Down Expand Up @@ -175,66 +175,76 @@ function WorkerDetailApp() {
${workerHeartbeat && html`<${Field} label="Last Heartbeat" value=${workerHeartbeat} />`}
${workerCpu && html`<${Field} label="CPU Cores" value=${workerCpu} />`}
${accelDisplay && html`<${Field} label="Accelerator" value=${accelDisplay} />`}
${!liveRes && workerMem > 0 && html`<${Field} label="Memory" value=${formatBytes(workerMem)} />`}
${!liveRes && workerDisk > 0 && html`<${Field} label="Disk" value=${formatBytes(workerDisk)} />`}
</dl>
${liveRes && html`
<${ResourceSection} title="Live Utilization">
<div style="display:flex;align-items:center;gap:8px">
<div style="flex:1"><${Gauge} label="CPU" value=${liveCpuPct || 0} max=${100} format="percent" /></div>
${resourceHistory.length >= 2 && html`
<${Sparkline} values=${resourceHistory.map(s => s.cpuPercent || 0)}
max=${100} width=${80} height=${24}
color="var(--color-accent)"
fillColor="rgba(9,105,218,0.1)" />
`}
<//>
</div>

${liveRes && html`
<div class="utilization-panel">
<h2 class="utilization-panel__title">Live Utilization</h2>
<div class="utilization-panel__grid">
<div class="utilization-metric">
<div class="utilization-metric__header">
<span class="utilization-metric__label">CPU</span>
<span class="utilization-metric__value utilization-metric__value--accent">${(liveCpuPct || 0) + '%'}</span>
</div>
<div class="utilization-metric__chart">
<${Sparkline} values=${resourceHistory.map(s => s.cpuPercent || 0)}
max=${100} width=${240} height=${40}
color="var(--color-accent)"
fillColor="rgba(9,105,218,0.10)" />
</div>
${liveMemTotal > 0 && html`
<div style="display:flex;align-items:center;gap:8px">
<div style="flex:1"><${Gauge} label="Memory" value=${liveMemUsed} max=${liveMemTotal} format="bytes" /></div>
${resourceHistory.length >= 2 && html`
<${Sparkline} values=${resourceHistory.map(s => parseInt(s.memoryUsedBytes || 0))}
max=${liveMemTotal} width=${80} height=${24}
color="var(--color-success)"
fillColor="rgba(26,127,55,0.1)" />
`}
</div>

${liveMemTotal > 0 && html`
<div class="utilization-metric">
<div class="utilization-metric__header">
<span class="utilization-metric__label">Memory</span>
<span class="utilization-metric__value utilization-metric__value--success">${formatBytes(liveMemUsed) + ' / ' + formatBytes(liveMemTotal)}</span>
</div>
<div class="utilization-metric__chart">
<${Sparkline} values=${resourceHistory.map(s => parseInt(s.memoryUsedBytes || 0))}
max=${liveMemTotal} width=${240} height=${40}
color="var(--color-success)"
fillColor="rgba(26,127,55,0.10)" />
</div>
`}
${liveDiskTotal > 0 && html`
<div style="display:flex;align-items:center;gap:8px">
<div style="flex:1"><${Gauge} label="Disk" value=${liveDiskUsed} max=${liveDiskTotal} format="bytes" /></div>
${resourceHistory.length >= 2 && html`
<${Sparkline} values=${resourceHistory.map(s => parseInt(s.diskUsedBytes || 0))}
max=${liveDiskTotal} width=${80} height=${24}
color="var(--color-warning)"
fillColor="rgba(191,135,0,0.1)" />
`}
</div>
`}

${liveDiskTotal > 0 && html`
<div class="utilization-metric">
<div class="utilization-metric__header">
<span class="utilization-metric__label">Disk</span>
<span class="utilization-metric__value utilization-metric__value--warning">${formatBytes(liveDiskUsed) + ' / ' + formatBytes(liveDiskTotal)}</span>
</div>
`}
<div style="display:flex;align-items:center;gap:8px">
<div style="flex:1">
<div class="gauge">
<span class="gauge-label">Net</span>
<span class="gauge-value" style="flex:1;text-align:right;font-variant-numeric:tabular-nums">
${'↓ ' + formatRate(liveNetRecv) + ' ↑ ' + formatRate(liveNetSent)}
</span>
</div>
<div class="utilization-metric__chart">
<${Sparkline} values=${resourceHistory.map(s => parseInt(s.diskUsedBytes || 0))}
max=${liveDiskTotal} width=${240} height=${40}
color="var(--color-warning)"
fillColor="rgba(154,103,0,0.10)" />
</div>
${resourceHistory.length >= 2 && html`
<${Sparkline} values=${resourceHistory.map(s => parseInt(s.netRecvBps || 0) + parseInt(s.netSentBps || 0))}
width=${80} height=${24}
color="var(--color-purple, #8250df)"
fillColor="rgba(130,80,223,0.1)" />
`}
</div>
<//>
`}
${!liveRes && html`
<dl class="worker-detail-fields">
${workerMem > 0 && html`<${Field} label="Memory" value=${formatBytes(workerMem)} />`}
${workerDisk > 0 && html`<${Field} label="Disk" value=${formatBytes(workerDisk)} />`}
</dl>
`}
<//>
</div>
`}

<div class="utilization-metric">
<div class="utilization-metric__header">
<span class="utilization-metric__label">Network</span>
<span class="utilization-metric__value utilization-metric__value--purple">
${'↓ ' + formatRate(liveNetRecv) + ' ↑ ' + formatRate(liveNetSent)}
</span>
</div>
<div class="utilization-metric__chart">
<${Sparkline} values=${resourceHistory.map(s => parseInt(s.netRecvBps || 0) + parseInt(s.netSentBps || 0))}
width=${240} height=${40}
color="var(--color-purple, #8250df)"
fillColor="rgba(130,80,223,0.10)" />
</div>
</div>
</div>
</div>
`}

${'' /* --- Task History --- */}
<div class="worker-detail-logs-section">
Expand Down
17 changes: 11 additions & 6 deletions lib/iris/src/iris/cluster/static/shared/components.js
Original file line number Diff line number Diff line change
Expand Up @@ -99,18 +99,23 @@ export function InlineGauge({ value, max, format = 'percent', label }) {
* @param {string} [fillColor] - Optional fill color under the line
*/
export function Sparkline({ values, max, width = 64, height = 20, color = 'var(--color-accent)', fillColor }) {
if (!values || values.length < 2) return null;
const effectiveMax = max || Math.max(...values);
if (effectiveMax === 0) return null;
if (!values || values.length < 1) return null;

// Duplicate single values to render a flat baseline instead of hiding
const data = values.length === 1 ? [values[0], values[0]] : values;
const effectiveMax = max || Math.max(...data);

// Pad from edges so the line doesn't clip
const pad = 1;
const innerW = width - 2 * pad;
const innerH = height - 2 * pad;

const points = values.map((v, i) => {
const x = pad + (i / (values.length - 1)) * innerW;
const y = pad + innerH - (Math.min(v, effectiveMax) / effectiveMax) * innerH;
const points = data.map((v, i) => {
const x = pad + (i / (data.length - 1)) * innerW;
// When max is 0 (all idle), draw a flat line at the bottom
const y = effectiveMax === 0
? pad + innerH
: pad + innerH - (Math.min(v, effectiveMax) / effectiveMax) * innerH;
return `${x.toFixed(1)},${y.toFixed(1)}`;
});

Expand Down
73 changes: 73 additions & 0 deletions lib/iris/src/iris/cluster/static/shared/styles.css
Original file line number Diff line number Diff line change
Expand Up @@ -1096,6 +1096,79 @@ input:focus, select:focus { outline: none; border-color: #0969da; box-shadow: 0
white-space: nowrap;
}

/* === Live Utilization Panel (worker detail) === */
/* Outer card that wraps the live CPU / memory / disk / network metrics. */
.utilization-panel {
  background: var(--color-surface);
  border: 1px solid var(--color-border);
  border-radius: var(--radius-lg);
  padding: 20px 24px;
  margin-bottom: 24px;
}

/* Small uppercase heading shown at the top of the card. */
.utilization-panel__title {
  font-size: 13px;
  font-weight: 600;
  color: var(--color-text-secondary);
  text-transform: uppercase;
  letter-spacing: 0.5px;
  margin: 0 0 16px 0;
}

/* Two equal columns of metric tiles. */
.utilization-panel__grid {
  display: grid;
  grid-template-columns: repeat(2, 1fr);
  gap: 16px;
}

/* Collapse to a single column on narrow viewports. */
@media (max-width: 640px) {
  .utilization-panel__grid { grid-template-columns: 1fr; }
}

/* One metric tile: header row stacked above its chart. */
.utilization-metric {
  display: flex;
  flex-direction: column;
  gap: 8px;
  background: var(--color-bg);
  border-radius: var(--radius-md);
  padding: 14px 16px;
}

/* Label on the left, current value on the right. */
.utilization-metric__header {
  display: flex;
  justify-content: space-between;
  align-items: baseline;
}

.utilization-metric__label {
  font-size: 11px;
  font-weight: 600;
  color: var(--color-text-secondary);
  text-transform: uppercase;
  letter-spacing: 0.5px;
}

/* Monospace, tabular digits so the value does not jitter as it updates. */
.utilization-metric__value {
  font-size: 14px;
  font-weight: 600;
  font-family: var(--font-mono);
  font-variant-numeric: tabular-nums;
}

/* Colour modifiers for the value text. */
.utilization-metric__value--accent { color: var(--color-accent); }
.utilization-metric__value--success { color: var(--color-success); }
.utilization-metric__value--warning { color: var(--color-warning); }
.utilization-metric__value--danger { color: var(--color-danger); }
/* Same fallback the Network sparkline in worker-detail.js passes, so the
   value text still gets a colour if --color-purple is not defined. */
.utilization-metric__value--purple { color: var(--color-purple, #8250df); }

.utilization-metric__chart {
  display: flex;
}

/* Stretch the sparkline SVG across the full tile width. */
.utilization-metric__chart .sparkline {
  width: 100%;
  height: 40px;
}

/* === Sparkline (inline trend chart) === */
.sparkline {
display: inline-block;
Expand Down
62 changes: 57 additions & 5 deletions lib/iris/tests/e2e/test_dashboard.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
rendering via screenshots.
"""

import time

import pytest
from iris.rpc import cluster_pb2

Expand Down Expand Up @@ -142,17 +144,69 @@ def test_worker_detail_shows_network_and_disk_sparklines(cluster, page, screensh

if not _is_noop_page(page):
page.wait_for_function(
"() => document.querySelector('.resource-section') !== null",
"() => document.querySelector('.utilization-panel') !== null",
timeout=10000,
)
# Live Utilization section should show CPU, Memory, Disk (all with sparklines) and Net
# Live Utilization panel should show CPU, Memory, Disk, and Network
assert_visible(page, "text=Live Utilization")
assert_visible(page, "text=CPU")
assert_visible(page, "text=Disk")
assert_visible(page, "text=Net")
assert_visible(page, "text=Network")
screenshot("worker-detail-net-disk-sparklines")


def _hold_for_heartbeats():
"""Sleep long enough for multiple heartbeat cycles to accumulate resource history."""
import time

time.sleep(6)
return 1


def test_worker_detail_sparklines_with_history(cluster, page, screenshot):
    """Check that the worker detail page draws sparklines once history exists.

    A job that holds for several heartbeat cycles (local
    heartbeat_interval=0.5s) is submitted so the worker's resource_history has
    entries for the Sparkline SVGs by the time the page is loaded.
    """
    job = cluster.submit(_hold_for_heartbeats, "worker-sparkline-history")
    cluster.wait_for_state(job, cluster_pb2.JOB_STATE_RUNNING, timeout=15)

    worker_id = cluster.task_status(job).worker_id
    assert worker_id

    # Let a few heartbeats land before loading the page (local interval is 0.5s).
    time.sleep(3)

    dashboard_goto(page, f"{cluster.url}/worker/{worker_id}")

    if not _is_noop_page(page):
        # Block until the panel exists and holds at least three sparkline SVGs.
        page.wait_for_function(
            "() => document.querySelector('.utilization-panel') !== null"
            " && document.querySelectorAll('.utilization-panel svg.sparkline').length >= 3",
            timeout=15000,
        )

    # The panel title and all four metric labels must be visible.
    for label in ("Live Utilization", "CPU", "Memory", "Disk", "Network"):
        assert_visible(page, f"text={label}")

    # The sparklines must be present as SVG elements, not omitted for lack of data.
    if not _is_noop_page(page):
        rendered = page.locator(".utilization-panel svg.sparkline").count()
        assert rendered >= 3, f"Expected at least 3 sparkline SVGs in utilization panel, got {rendered}"

    screenshot("worker-detail-sparklines-with-history")

    cluster.wait(job, timeout=30)


def _allocate_memory_and_wait():
"""Allocate ~50 MB and sleep long enough for at least one stats collection cycle."""
import time
Expand Down Expand Up @@ -192,8 +246,6 @@ def test_job_detail_task_table_shows_resource_values(cluster, page, screenshot):
cluster.wait_for_state(job, cluster_pb2.JOB_STATE_RUNNING, timeout=15)

# Wait for stats collection (poll interval is 5s)
import time

time.sleep(7)

dashboard_goto(page, f"{cluster.url}/job/{job.job_id.to_wire()}")
Expand Down