[HUD] Add GitHub Actions runners monitoring dashboard (#7082)

ZainRizvi · web-flow · commit e58a6f16cc56 · 2025-09-03T11:46:39.000-05:00
Adds a page for monitoring the runners of a GitHub organization,
`/runners/[org]`, replicating the read-only functionality of the
GitHub runners admin page.

Includes search, filtering, and URL editing functionality

The logic for choosing the label to group runners by is somewhat ad hoc and
tailored to our current set of runners, but given that we have no
master list of all runners offered by anyone, this seems like the best
way to do it for the moment.

Future work:
- Enable this to display runners at a per-repo level. I ran into
authorization failures when I tried that; it seems our GitHub app
might not have access to read repo runners yet.
diff --git a/torchci/components/layout/NavBar.tsx b/torchci/components/layout/NavBar.tsx
@@ -178,6 +178,10 @@ function NavBar() {
       name: "Utilization Workflow Report",
       href: "/utilization/report?group_by=workflow_name",
     },
+    {
+      name: "PyTorch Runners",
+      href: "/runners/pytorch",
+    },
   ];
 
   const metricsDropdown = [
diff --git a/torchci/components/runners/RunnerGroupCard.tsx b/torchci/components/runners/RunnerGroupCard.tsx
@@ -0,0 +1,167 @@
+/**
+ * @fileoverview Expandable card component for displaying grouped GitHub Actions runners
+ *
+ * This component displays a group of GitHub Actions runners in a collapsible card format.
+ * Each card shows summary statistics (idle, busy, offline counts) and can be expanded to
+ * reveal a detailed table of all runners in the group.
+ *
+ * Props:
+ * - group: RunnerGroup data containing runners and metadata
+ * - searchTerm: Filter string; only runners matching it are listed (case-insensitive)
+ * - isExpanded: Controls whether the detailed view is shown
+ * - onExpandChange: Callback when expand/collapse state changes
+ *
+ */
+
+import { ExpandLess, ExpandMore } from "@mui/icons-material";
+import {
+  Box,
+  Card,
+  CardContent,
+  Chip,
+  Collapse,
+  IconButton,
+  Table,
+  TableBody,
+  TableCell,
+  TableContainer,
+  TableHead,
+  TableRow,
+  Typography,
+  useTheme,
+} from "@mui/material";
+import { RunnerGroup } from "lib/runnerUtils";
+import { useMemo } from "react";
+import { StatusChip } from "./StatusChip";
+
+export function RunnerGroupCard({ // Collapsible card: summary chips for one runner group plus an expandable runner table.
+  group,
+  searchTerm,
+  isExpanded,
+  onExpandChange,
+}: {
+  group: RunnerGroup; // runners of the group plus its idle/busy/offline counts
+  searchTerm: string; // case-insensitive substring filter; an empty string disables filtering
+  isExpanded: boolean; // controlled by the parent; drives the Collapse below
+  onExpandChange: (expanded: boolean) => void; // called with the negated isExpanded when the header row is clicked
+}) {
+  const theme = useTheme();
+
+  // Keep only runners whose name, id, OS, or any label name contains the search term (case-insensitive)
+  const filteredRunners = useMemo(() => {
+    if (!searchTerm) return group.runners; // no search term: show every runner
+
+    const term = searchTerm.toLowerCase();
+    return group.runners.filter(
+      (runner) =>
+        runner.name.toLowerCase().includes(term) ||
+        runner.id.toString().includes(term) ||
+        runner.os.toLowerCase().includes(term) ||
+        runner.labels.some((label) => label.name.toLowerCase().includes(term))
+    );
+  }, [group.runners, searchTerm]); // NOTE(review): the heading shows this filtered length while the chips show group-wide counts
+
+  const handleExpandClick = () => {
+    onExpandChange(!isExpanded);
+  };
+
+  return (
+    <Card
+      sx={{
+        mb: 2,
+        minHeight: 120,
+        display: "flex",
+        flexDirection: "column",
+        backgroundColor:
+          theme.palette.mode === "dark"
+            ? theme.palette.warning.dark + "20" // dark mode; "20" is appended as an alpha suffix — assumes a #rrggbb palette string, TODO confirm
+            : theme.palette.warning.light + "20", // light mode; same alpha-suffix assumption
+        "&:hover": {
+          backgroundColor:
+            theme.palette.mode === "dark"
+              ? theme.palette.warning.dark + "30"
+              : theme.palette.warning.light + "30",
+          opacity: 0.9,
+        },
+      }}
+    >
+      <CardContent>
+        <Box
+          display="flex"
+          justifyContent="space-between"
+          alignItems="center"
+          onClick={handleExpandClick}
+          sx={{ cursor: "pointer" }}
+        >
+          <Box>
+            <Typography variant="h6" component="div">
+              {group.label} ({filteredRunners.length} runners)
+            </Typography>
+            <Box display="flex" gap={1} mt={1}>
+              <Chip
+                label={`${group.idleCount} idle`}
+                color="success"
+                size="small"
+              />
+              <Chip
+                label={`${group.busyCount} busy`}
+                color="warning"
+                size="small"
+              />
+              <Chip
+                label={`${group.offlineCount} offline`}
+                color="default"
+                size="small"
+              />
+            </Box>
+          </Box>
+          <IconButton>
+            {isExpanded ? <ExpandLess /> : <ExpandMore />}
+          </IconButton>
+        </Box>
+
+        <Collapse in={isExpanded} timeout="auto" unmountOnExit>
+          <Box mt={2}>
+            <TableContainer>
+              <Table size="small">
+                <TableHead>
+                  <TableRow>
+                    <TableCell>Name</TableCell>
+                    <TableCell>Status</TableCell>
+                    <TableCell>ID</TableCell>
+                    <TableCell>OS</TableCell>
+                    <TableCell>Labels</TableCell>
+                  </TableRow>
+                </TableHead>
+                <TableBody>
+                  {filteredRunners.map((runner) => (
+                    <TableRow key={runner.id}>
+                      <TableCell>{runner.name}</TableCell>
+                      <TableCell>
+                        <StatusChip runner={runner} />
+                      </TableCell>
+                      <TableCell>{runner.id}</TableCell>
+                      <TableCell>{runner.os}</TableCell>
+                      <TableCell>
+                        <Box display="flex" flexWrap="wrap" gap={0.5}>
+                          {runner.labels.map((label, index) => (
+                            <Chip
+                              key={index}
+                              label={label.name}
+                              size="small"
+                              variant="outlined"
+                            />
+                          ))}
+                        </Box>
+                      </TableCell>
+                    </TableRow>
+                  ))}
+                </TableBody>
+              </Table>
+            </TableContainer>
+          </Box>
+        </Collapse>
+      </CardContent>
+    </Card>
+  );
+}
diff --git a/torchci/components/runners/StatusChip.tsx b/torchci/components/runners/StatusChip.tsx
@@ -0,0 +1,34 @@
+/**
+ * @fileoverview Status chip component for GitHub Actions runners
+ *
+ * Displays the current status of a GitHub Actions runner
+ *
+ */
+
+import { Chip } from "@mui/material";
+import { RunnerData } from "lib/runnerUtils";
+
+export function StatusChip({ runner }: { runner: RunnerData }) { // Renders a small chip labelled "offline", "busy", or "idle" for one runner.
+  let color: "success" | "warning" | "default";
+  let label: string;
+
+  if (runner.status === "offline") { // checked first, so an offline runner is never shown as busy
+    color = "default";
+    label = "offline";
+  } else if (runner.busy) { // not offline and busy
+    color = "warning";
+    label = "busy";
+  } else { // not offline and not busy
+    color = "success";
+    label = "idle";
+  }
+
+  return (
+    <Chip
+      label={label}
+      color={color}
+      size="small"
+      sx={{ minWidth: 80, fontWeight: "bold" }}
+    />
+  );
+}
diff --git a/torchci/lib/runnerUtils.ts b/torchci/lib/runnerUtils.ts
@@ -0,0 +1,154 @@
+// Shared utilities for GitHub runners functionality
+
+// Types
+export interface RunnerData { // One runner — presumably mirrors the GitHub REST API runner object; TODO confirm
+  id: number;
+  name: string; // also parsed by getRunnerGroupLabel as a fallback when no label qualifies
+  os: string;
+  status: "online" | "offline";
+  busy: boolean; // with status: online + busy counts as "busy", online + not busy as "idle"
+  labels: Array<{
+    id?: number;
+    name: string; // the only label field getRunnerGroupLabel reads
+    type: "read-only" | "custom";
+  }>;
+}
+
+export interface RunnerGroup { // One bucket of runners sharing a group label, as built by groupRunners
+  label: string; // value returned by getRunnerGroupLabel; "unknown" when nothing matched
+  totalCount: number; // number of runners in the group
+  idleCount: number; // runners with status "online" and not busy
+  busyCount: number; // runners with status "online" and busy
+  offlineCount: number; // runners with status "offline"
+  runners: RunnerData[]; // sorted idle, busy, offline, then by name
+}
+
+export interface RunnersApiResponse { // NOTE(review): presumably the payload of the runners API route — confirm against the handler
+  groups: RunnerGroup[];
+  totalRunners: number; // presumably the runner count across all groups — TODO confirm
+}
+
+// Utility functions
+export function getRunnerGroupLabel(runner: RunnerData): string { // Derives the label used to group a runner; returns "unknown" if nothing matches.
+  const labelNames = runner.labels.map((label) => label.name);
+
+  // Keep labels that contain "." (except those ending in ".runners") or that start with "macos-".
+  // Why such funky logic? Our runners carry many labels today, but this rule
+  // matches what is common to all the labels that jobs actually use.
+  const validLabels = labelNames.filter(
+    (name) =>
+      (name.includes(".") && !name.endsWith(".runners")) || // "*.runners" is added to autoscaled runners
+      name.startsWith("macos-")
+  );
+
+  if (validLabels.length > 0) {
+    // Handle synonyms. Today these appear on macOS runners, which carry two
+    // labels that jobs could use interchangeably instead of just one,
+    // e.g. "macos-m1-14" and "macos-m1-stable".
+    // When several labels qualify, assume they are all synonyms and join them.
+    // NOTE(review): join order follows the runner's label order, so the same labels in another order yield a different group.
+    return validLabels.join(" / ");
+  }
+
+  // Fallback: parse the runner name for grouping info.
+  // Special case for ROCm runners that don't have proper GitHub labels
+  // but use naming conventions like: linux.rocm.gpu.gfx942.1-xxxx-runner-xxxxx
+  const runnerName = runner.name;
+
+  // Look for a dotted prefix before a "-" and random suffix; patterns are tried in order, first match wins
+  const namePatterns = [
+    /^([a-z]+\.[a-z0-9.]+)-[a-z0-9]+/i, // linux.rocm.gpu.gfx942.1-xxxx
+    /^([a-z]+\.[a-z0-9.]+\.[a-z0-9]+)/i, // linux.rocm.gpu style prefix (no "-" suffix required)
+  ];
+
+  for (const pattern of namePatterns) {
+    const match = runnerName.match(pattern);
+    if (match) {
+      return match[1]; // the captured dotted prefix
+    }
+  }
+
+  // Last resort: use everything before the first "-" if that part contains a "."
+  if (runnerName.includes(".")) {
+    const parts = runnerName.split("-");
+    if (parts[0].includes(".")) {
+      return parts[0];
+    }
+  }
+
+  return "unknown";
+}
+
+// Sort comparator that pushes "unknown" labels to the end and expresses no preference (returns 0) for any other pair
+export function unknownGoesLast(
+  a: { label: string },
+  b: { label: string }
+): number {
+  if (a.label === "unknown" && b.label !== "unknown") return 1; // only a is unknown: a sorts after b
+  if (a.label !== "unknown" && b.label === "unknown") return -1; // only b is unknown: a sorts before b
+  return 0; // both or neither unknown: leave the ordering to the caller
+}
+
+export function groupRunners(runners: RunnerData[]): RunnerGroup[] { // Buckets runners by getRunnerGroupLabel and returns sorted groups with status counts.
+  const groups = new Map<string, RunnerData[]>();
+
+  // Bucket runners by their group label
+  for (const runner of runners) {
+    const label = getRunnerGroupLabel(runner);
+    if (!groups.has(label)) {
+      groups.set(label, []);
+    }
+    groups.get(label)!.push(runner);
+  }
+
+  // Convert each bucket to a RunnerGroup with status counts
+  const result: RunnerGroup[] = [];
+  for (const [label, groupRunners] of groups.entries()) { // NOTE(review): this loop variable shadows the enclosing function's name
+    const idleCount = groupRunners.filter(
+      (r) => r.status === "online" && !r.busy
+    ).length;
+    const busyCount = groupRunners.filter(
+      (r) => r.status === "online" && r.busy
+    ).length;
+    const offlineCount = groupRunners.filter(
+      (r) => r.status === "offline"
+    ).length;
+
+    // Status priority used for sorting: idle (0), busy (1), offline (2)
+    const getStatusPriority = (runner: RunnerData): number => {
+      if (runner.status === "offline") return 2;
+      if (runner.status === "online" && runner.busy) return 1;
+      return 0; // idle
+    };
+
+    // Sort runners by status (idle, busy, offline) then by name; sort() reorders the bucket array in place
+    const sortedRunners = groupRunners.sort((a, b) => {
+      // Primary key: status priority
+      const statusComparison = getStatusPriority(a) - getStatusPriority(b);
+
+      // Tie-breaker when the status is the same: runner name
+      return statusComparison !== 0
+        ? statusComparison
+        : a.name.localeCompare(b.name);
+    });
+
+    result.push({
+      label,
+      totalCount: groupRunners.length,
+      idleCount,
+      busyCount,
+      offlineCount,
+      runners: sortedRunners,
+    });
+  }
+
+  // Sort groups so that "unknown" comes last, then by total count (descending)
+  result.sort((a, b) => {
+    const unknownComparison = unknownGoesLast(a, b);
+    return unknownComparison !== 0
+      ? unknownComparison
+      : b.totalCount - a.totalCount;
+  });
+
+  return result;
+}
diff --git a/torchci/next-env.d.ts b/torchci/next-env.d.ts
@@ -1,7 +1,6 @@
 /// <reference types="next" />
 /// <reference types="next/image-types/global" />
 /// <reference types="next/navigation-types/compat/navigation" />
-/// <reference path="./.next/types/routes.d.ts" />
 
 // NOTE: This file should not be edited
-// see https://nextjs.org/docs/app/api-reference/config/typescript for more information.
+// see https://nextjs.org/docs/app/building-your-application/configuring/typescript for more information.
diff --git a/torchci/pages/api/runners/[org].ts b/torchci/pages/api/runners/[org].ts
diff --git a/torchci/pages/runners/[org].tsx b/torchci/pages/runners/[org].tsx