Skip to content

Commit e58a6f1

Browse files
authored
[HUD] Add GitHub Actions runners monitoring dashboard (#7082)
Adds a page for monitoring the runners of a GitHub organization at `/runners/[org]`, replicating the read-only functionality of the GitHub runners admin page. It includes search, filtering, and URL editing functionality. The logic that picks the label to group the runners by is somewhat ad hoc and custom-fit to our current set of runners, but given that we have no master list of all runners offered by anyone, this seems like the best way to do it for the moment. Future work: - Enable this to display runners at a per-repo level. I was running into authorization failures when I tried that; it seems like our GitHub app might not have access to read the repo runners yet.
1 parent 61b9e2d commit e58a6f1

File tree

7 files changed

+857
-2
lines changed

7 files changed

+857
-2
lines changed

torchci/components/layout/NavBar.tsx

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,10 @@ function NavBar() {
178178
name: "Utilization Workflow Report",
179179
href: "/utilization/report?group_by=workflow_name",
180180
},
181+
{
182+
name: "PyTorch Runners",
183+
href: "/runners/pytorch",
184+
},
181185
];
182186

183187
const metricsDropdown = [
Lines changed: 167 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,167 @@
1+
/**
2+
* @fileoverview Expandable card component for displaying grouped GitHub Actions runners
3+
*
4+
* This component displays a group of GitHub Actions runners in a collapsible card format.
5+
* Each card shows summary statistics (idle, busy, offline counts) and can be expanded to
6+
* reveal a detailed table of all runners in the group.
7+
*
8+
* Props:
9+
* - group: RunnerGroup data containing runners and metadata
10+
* - searchTerm: Filter string; only runners matching it are counted in the header and listed in the table
11+
* - isExpanded: Controls whether the detailed view is shown
12+
* - onExpandChange: Callback when expand/collapse state changes
13+
*
14+
*/
15+
16+
import { ExpandLess, ExpandMore } from "@mui/icons-material";
17+
import {
18+
Box,
19+
Card,
20+
CardContent,
21+
Chip,
22+
Collapse,
23+
IconButton,
24+
Table,
25+
TableBody,
26+
TableCell,
27+
TableContainer,
28+
TableHead,
29+
TableRow,
30+
Typography,
31+
useTheme,
32+
} from "@mui/material";
33+
import { RunnerGroup } from "lib/runnerUtils";
34+
import { useMemo } from "react";
35+
import { StatusChip } from "./StatusChip";
36+
37+
/**
 * Collapsible card for a single runner group.
 *
 * The header shows the group label, the number of runners that match
 * `searchTerm`, and the group's idle / busy / offline counts. Clicking the
 * header (or activating the expand button) calls `onExpandChange` with the
 * negated `isExpanded` value; the expanded state itself is owned by the caller.
 * When expanded, a table lists every matching runner.
 *
 * @param group - Group whose runners and counts are displayed.
 * @param searchTerm - Case-insensitive substring matched against each runner's
 *   name, id, OS and label names. An empty string matches every runner.
 * @param isExpanded - Whether the runner table is currently shown.
 * @param onExpandChange - Called with the requested new expanded state.
 */
export function RunnerGroupCard({
  group,
  searchTerm,
  isExpanded,
  onExpandChange,
}: {
  group: RunnerGroup;
  searchTerm: string;
  isExpanded: boolean;
  onExpandChange: (expanded: boolean) => void;
}) {
  const theme = useTheme();

  // Base tint for the card; a two-digit hex alpha suffix is appended below.
  const tintColor =
    theme.palette.mode === "dark"
      ? theme.palette.warning.dark
      : theme.palette.warning.light;

  // Filter runners based on search term
  const filteredRunners = useMemo(() => {
    if (!searchTerm) return group.runners;

    const term = searchTerm.toLowerCase();
    return group.runners.filter(
      (runner) =>
        runner.name.toLowerCase().includes(term) ||
        runner.id.toString().includes(term) ||
        runner.os.toLowerCase().includes(term) ||
        runner.labels.some((label) => label.name.toLowerCase().includes(term))
    );
  }, [group.runners, searchTerm]);

  const handleExpandClick = () => {
    onExpandChange(!isExpanded);
  };

  return (
    <Card
      sx={{
        mb: 2,
        minHeight: 120,
        display: "flex",
        flexDirection: "column",
        backgroundColor: tintColor + "20", // tint with opacity
        "&:hover": {
          backgroundColor: tintColor + "30",
          opacity: 0.9,
        },
      }}
    >
      <CardContent>
        <Box
          display="flex"
          justifyContent="space-between"
          alignItems="center"
          onClick={handleExpandClick}
          sx={{ cursor: "pointer" }}
        >
          <Box>
            <Typography variant="h6" component="div">
              {group.label} ({filteredRunners.length} runners)
            </Typography>
            <Box display="flex" gap={1} mt={1}>
              <Chip
                label={`${group.idleCount} idle`}
                color="success"
                size="small"
              />
              <Chip
                label={`${group.busyCount} busy`}
                color="warning"
                size="small"
              />
              <Chip
                label={`${group.offlineCount} offline`}
                color="default"
                size="small"
              />
            </Box>
          </Box>
          {/* No onClick here: the click bubbles to the header Box above.
              The aria attributes give the icon-only button an accessible
              name and expose the expanded state to assistive technology. */}
          <IconButton
            aria-label={
              isExpanded
                ? `Collapse ${group.label} runners`
                : `Expand ${group.label} runners`
            }
            aria-expanded={isExpanded}
          >
            {isExpanded ? <ExpandLess /> : <ExpandMore />}
          </IconButton>
        </Box>

        <Collapse in={isExpanded} timeout="auto" unmountOnExit>
          <Box mt={2}>
            <TableContainer>
              <Table size="small">
                <TableHead>
                  <TableRow>
                    <TableCell>Name</TableCell>
                    <TableCell>Status</TableCell>
                    <TableCell>ID</TableCell>
                    <TableCell>OS</TableCell>
                    <TableCell>Labels</TableCell>
                  </TableRow>
                </TableHead>
                <TableBody>
                  {filteredRunners.map((runner) => (
                    <TableRow key={runner.id}>
                      <TableCell>{runner.name}</TableCell>
                      <TableCell>
                        <StatusChip runner={runner} />
                      </TableCell>
                      <TableCell>{runner.id}</TableCell>
                      <TableCell>{runner.os}</TableCell>
                      <TableCell>
                        <Box display="flex" flexWrap="wrap" gap={0.5}>
                          {runner.labels.map((label, index) => (
                            <Chip
                              key={index}
                              label={label.name}
                              size="small"
                              variant="outlined"
                            />
                          ))}
                        </Box>
                      </TableCell>
                    </TableRow>
                  ))}
                </TableBody>
              </Table>
            </TableContainer>
          </Box>
        </Collapse>
      </CardContent>
    </Card>
  );
}
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
/**
2+
* @fileoverview Status chip component for GitHub Actions runners
3+
*
4+
* Displays the current status of a GitHub Actions runner
5+
*
6+
*/
7+
8+
import { Chip } from "@mui/material";
9+
import { RunnerData } from "lib/runnerUtils";
10+
11+
// Chip color for each runner state shown by StatusChip.
const STATUS_CHIP_COLORS = {
  offline: "default",
  busy: "warning",
  idle: "success",
} as const;

/**
 * Small chip labelled with a runner's state.
 *
 * A runner whose status is "offline" is shown as "offline" regardless of its
 * `busy` flag; otherwise it is "busy" when `busy` is set and "idle" when not.
 *
 * @param runner - Runner whose `status` and `busy` fields decide the chip.
 */
export function StatusChip({ runner }: { runner: RunnerData }) {
  const state: keyof typeof STATUS_CHIP_COLORS =
    runner.status === "offline" ? "offline" : runner.busy ? "busy" : "idle";

  return (
    <Chip
      label={state}
      color={STATUS_CHIP_COLORS[state]}
      size="small"
      sx={{ minWidth: 80, fontWeight: "bold" }}
    />
  );
}

torchci/lib/runnerUtils.ts

Lines changed: 154 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,154 @@
1+
// Shared utilities for GitHub runners functionality
2+
3+
// Types
4+
/** A single GitHub runner as consumed by the grouping and display code. */
export interface RunnerData {
  /** Numeric runner id. */
  id: number;
  /** Runner name; also parsed as a fallback when no label identifies the group. */
  name: string;
  /** Operating system string. */
  os: string;
  /** Connectivity state. */
  status: "online" | "offline";
  /** Busy flag; an online runner with this unset is treated as idle. */
  busy: boolean;
  /** Labels attached to the runner. */
  labels: {
    id?: number;
    name: string;
    type: "read-only" | "custom";
  }[];
}
16+
17+
/**
 * A set of runners that share one group label, plus per-state counts.
 *
 * The field notes below describe the values as `groupRunners` populates them.
 */
export interface RunnerGroup {
  /** Group label (the value returned by `getRunnerGroupLabel`). */
  label: string;
  /** Number of runners in the group. */
  totalCount: number;
  /** Runners that are online and not busy. */
  idleCount: number;
  /** Runners that are online and busy. */
  busyCount: number;
  /** Runners whose status is "offline". */
  offlineCount: number;
  /** The group's runners, ordered idle, busy, offline and then by name. */
  runners: RunnerData[];
}
25+
26+
/** Shape of the runners API payload: the runner groups plus an overall count. */
export interface RunnersApiResponse {
  /** Runner groups to display. */
  groups: RunnerGroup[];
  /** Presumably the number of runners across all groups; confirm against the API handler. */
  totalRunners: number;
}
30+
31+
// Utility functions
32+
// Runner-name shapes tried, in order, when no label identifies the group.
// Capture group 1 is the dotted prefix before the "-<random suffix>" part.
const RUNNER_NAME_PREFIX_PATTERNS: readonly RegExp[] = [
  /^([a-z]+\.[a-z0-9.]+)-[a-z0-9]+/i, // linux.rocm.gpu.gfx942.1-xxxx
  /^([a-z]+\.[a-z0-9.]+\.[a-z0-9]+)/i, // linux.rocm.gpu prefix
];

/**
 * Picks the label under which a runner is grouped.
 *
 * Resolution order:
 *  1. Labels that contain "." (but do not end in ".runners") or start with
 *     "macos-". If several qualify they are treated as synonyms and joined
 *     with " / ", de-duplicated and sorted so that the same set of labels
 *     always yields the same group key no matter how the runner lists them.
 *  2. A dotted prefix parsed from the runner name.
 *  3. The literal string "unknown".
 *
 * @param runner - Runner to classify.
 * @returns The group label, or "unknown" when nothing matches.
 */
export function getRunnerGroupLabel(runner: RunnerData): string {
  // Why such funky logic? We have many labels on our runners today, but this
  // is what's common in all the ones that jobs actually use.
  const groupingLabels = runner.labels
    .map((label) => label.name)
    .filter(
      (name) =>
        (name.includes(".") && !name.endsWith(".runners")) || // "*.runners" is added to autoscaled runners
        name.startsWith("macos-")
    );

  if (groupingLabels.length > 0) {
    // Handle synonyms. Today these are used by macOS runners, which carry two
    // labels that jobs could use interchangeably, e.g. "macos-m1-14" and
    // "macos-m1-stable". Assume all valid labels are synonyms of each other.
    // Sorting makes the joined key independent of label order, so runners
    // with the same synonyms cannot end up in different groups.
    return [...new Set(groupingLabels)].sort().join(" / ");
  }

  // Fallback: parse the runner name for grouping info.
  // Special case for ROCm runners that don't have proper GitHub labels but
  // use naming conventions like: linux.rocm.gpu.gfx942.1-xxxx-runner-xxxxx
  for (const pattern of RUNNER_NAME_PREFIX_PATTERNS) {
    const prefix = runner.name.match(pattern)?.[1];
    if (prefix !== undefined) {
      return prefix;
    }
  }

  // Last resort: use everything before the first "-" if it is dotted.
  const [firstSegment = ""] = runner.name.split("-");
  if (firstSegment.includes(".")) {
    return firstSegment;
  }

  return "unknown";
}
81+
82+
/**
 * Sort comparator that pushes entries labelled "unknown" to the end.
 *
 * @param a - First entry to compare.
 * @param b - Second entry to compare.
 * @returns 1 when only `a` is "unknown", -1 when only `b` is "unknown", and 0
 *   when both or neither are, leaving their relative order to the caller.
 */
export function unknownGoesLast(
  a: { label: string },
  b: { label: string }
): number {
  const aIsUnknown = a.label === "unknown";
  const bIsUnknown = b.label === "unknown";

  if (aIsUnknown === bIsUnknown) {
    return 0;
  }
  return aIsUnknown ? 1 : -1;
}
91+
92+
// Sort rank for a runner's state: idle (0), busy (1), offline (2).
function runnerStatusPriority(runner: RunnerData): number {
  if (runner.status === "offline") return 2;
  if (runner.status === "online" && runner.busy) return 1;
  return 0; // idle
}

/**
 * Buckets runners by their group label and summarises each bucket.
 *
 * Within a group, runners are ordered idle, busy, offline and then by name.
 * Groups are ordered by runner count (descending), with the "unknown" group
 * always last.
 *
 * @param runners - Runners to group. The array itself is not reordered.
 * @returns One RunnerGroup per distinct label, with idle/busy/offline counts.
 */
export function groupRunners(runners: RunnerData[]): RunnerGroup[] {
  const runnersByLabel = new Map<string, RunnerData[]>();

  // Group runners by label
  for (const runner of runners) {
    const label = getRunnerGroupLabel(runner);
    const bucket = runnersByLabel.get(label);
    if (bucket === undefined) {
      runnersByLabel.set(label, [runner]);
    } else {
      bucket.push(runner);
    }
  }

  // Convert to RunnerGroup format with counts
  const result: RunnerGroup[] = [];
  for (const [label, members] of runnersByLabel) {
    let idleCount = 0;
    let busyCount = 0;
    let offlineCount = 0;
    for (const member of members) {
      if (member.status === "offline") {
        offlineCount++;
      } else if (member.status === "online") {
        if (member.busy) {
          busyCount++;
        } else {
          idleCount++;
        }
      }
    }

    // Sort runners by status (idle, busy, offline) then by name. `members` is
    // an array built above, so sorting in place leaves the caller's input alone.
    members.sort((a, b) => {
      const statusComparison =
        runnerStatusPriority(a) - runnerStatusPriority(b);
      return statusComparison !== 0
        ? statusComparison
        : a.name.localeCompare(b.name);
    });

    result.push({
      label,
      totalCount: members.length,
      idleCount,
      busyCount,
      offlineCount,
      runners: members,
    });
  }

  // Keep "unknown" last; otherwise sort by total count (descending)
  result.sort((a, b) => {
    const unknownComparison = unknownGoesLast(a, b);
    return unknownComparison !== 0
      ? unknownComparison
      : b.totalCount - a.totalCount;
  });

  return result;
}

torchci/next-env.d.ts

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
/// <reference types="next" />
22
/// <reference types="next/image-types/global" />
33
/// <reference types="next/navigation-types/compat/navigation" />
4-
/// <reference path="./.next/types/routes.d.ts" />
54

65
// NOTE: This file should not be edited
7-
// see https://nextjs.org/docs/app/api-reference/config/typescript for more information.
6+
// see https://nextjs.org/docs/app/building-your-application/configuring/typescript for more information.

0 commit comments

Comments
 (0)