Compute GPU runner usage stats (#6095)

ZainRizvi · web-flow · commit c9222754f262 · 2024-12-20T11:38:28.000-08:00
This code is meant for manual usage.

It computes p10, p90, etc statistics about how many instances we use of
each nvidia runner type

(I know, leaving it as a notebook means ugly diffs, but for ad-hoc data
queries this is way more functional)
diff --git a/tools/fleet_analysis/runner_usage_stats.ipynb b/tools/fleet_analysis/runner_usage_stats.ipynb
@@ -0,0 +1,243 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Code to infer the fleet usage statistics for our GPU machines"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import clickhouse_connect\n",
+    "import pandas as pd\n",
+    "import os\n",
+    "import re\n",
+    "import logging\n",
+    "import matplotlib.pyplot as plt\n",
+    "\n",
+    "# env loader\n",
+    "import dotenv\n",
+    "dotenv.load_dotenv()\n",
+    "\n",
+    "logging.basicConfig(level=logging.INFO)\n",
+    "logger = logging.getLogger()\n",
+    "logger.setLevel(\"INFO\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# set these variables in a local .env file:\n",
+    "\n",
+    "CLICKHOUSE_HOST = os.environ['CLICKHOUSE_HOST']\n",
+    "CLICKHOUSE_USER = os.environ['CLICKHOUSE_USER']\n",
+    "CLICKHOUSE_PASSWORD = os.environ['CLICKHOUSE_PASSWORD']\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "client = clickhouse_connect.get_client(\n",
+    "    host=CLICKHOUSE_HOST,\n",
+    "    user=CLICKHOUSE_USER,\n",
+    "    password=CLICKHOUSE_PASSWORD,\n",
+    "    secure=True\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def clean_runner_type(runner_type):\n",
+    "    runner_type = re.sub(r'am2\\.', '', runner_type)\n",
+    "    runner_type = re.sub(r'amz2\\.', '', runner_type)\n",
+    "    runner_type = re.sub(r'amz2023\\.', '', runner_type)\n",
+    "    runner_type = re.sub(r'c\\.', '', runner_type)\n",
+    "    runner_type = re.sub(r'.canary$', '', runner_type)\n",
+    "    runner_type = re.sub(r'lf\\.', '', runner_type)\n",
+    "\n",
+    "    return runner_type\n",
+    "\n",
+    "\n",
+    "def get_nvidia_jobs_run(client, weeks_ago: int = 2):\n",
+    "    query = \"\"\"\n",
+    "    SELECT\n",
+    "        started_at,\n",
+    "        completed_at,\n",
+    "        age('minute', started_at, completed_at) AS duration_mins,\n",
+    "        arrayFirst(x -> x != 'self-hosted', labels) AS label,\n",
+    "        status,\n",
+    "        conclusion,\n",
+    "        name,\n",
+    "        url\n",
+    "    FROM\n",
+    "        workflow_job\n",
+    "    WHERE\n",
+    "        started_at >= subtractWeeks(now(), 2)\n",
+    "        AND length(arrayFilter(x -> x != 'self-hosted', labels)) > 0\n",
+    "        AND arrayFirst(x -> x != 'self-hosted', labels) LIKE '%gpu%'\n",
+    "        AND arrayFirst(x -> x != 'self-hosted', labels) LIKE '%nvidia%'\n",
+    "        AND arrayFirst(x -> x != 'self-hosted', labels) LIKE '%.%'\n",
+    "        AND status = 'completed'\n",
+    "    \"\"\"\n",
+    "\n",
+    "    data = client.query(query).result_set\n",
+    "\n",
+    "    df = pd.DataFrame(\n",
+    "        data,\n",
+    "        columns=[\n",
+    "            'started_at',\n",
+    "            'completed_at',\n",
+    "            'duration_mins',\n",
+    "            'label',\n",
+    "            'status',\n",
+    "            'conclusion',\n",
+    "            'name',\n",
+    "            'url']\n",
+    "    )\n",
+    "\n",
+    "    # clean the data\n",
+    "    df['started_at'] = pd.to_datetime(df['started_at'])\n",
+    "    df['completed_at'] = pd.to_datetime(df['completed_at'])\n",
+    "    df['duration_mins'] = df['duration_mins'].astype(int)\n",
+    "    df['label'] = df['label'].astype(str)\n",
+    "    df['status'] = df['status'].astype(str)\n",
+    "    df['conclusion'] = df['conclusion'].astype(str)\n",
+    "    df['name'] = df['name'].astype(str)\n",
+    "\n",
+    "    df['label'] = df['label'].apply(clean_runner_type)\n",
+    "\n",
+    "    return df\n",
+    "\n",
+    "\n",
+    "def get_runner_count_stats(job_run_df):\n",
+    "    # start when the first job was started_at\n",
+    "    start_time = job_run_df['started_at'].min()\n",
+    "    end_time = job_run_df['completed_at'].max()\n",
+    "    interval = pd.Timedelta(minutes=1)\n",
+    "    periods = pd.date_range(start=start_time, end=end_time, freq=interval)\n",
+    "\n",
+    "    # Initialize a DataFrame to store period stats\n",
+    "    period_stats = pd.DataFrame(index=periods)\n",
+    "\n",
+    "\n",
+    "    # For each unique label, at each time period we compute how many jobs are running in parallel\n",
+    "    for label in job_run_df['label'].unique():\n",
+    "        # Filter jobs by label\n",
+    "        label_df = job_run_df[job_run_df['label'] == label]\n",
+    "\n",
+    "        counts = []\n",
+    "        for period in periods:\n",
+    "            # Count jobs that are in progress during the interval\n",
+    "            count = label_df[(label_df['started_at'] <= period) & (label_df['completed_at'] > period)].shape[0]\n",
+    "            counts.append(count)\n",
+    "\n",
+    "        period_stats[label] = counts\n",
+    "\n",
+    "    return period_stats"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "num_weeks = 3\n",
+    "gpu_jobs_df = get_nvidia_jobs_run(client, num_weeks)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "gpu_stats = get_runner_count_stats(gpu_jobs_df)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Compute the quantiles only for the weekdays (Monday = 0, Friday = 4)\n",
+    "# For each label, get the p0, p5, p10, p90, p95, and p100 number of jobs in progress\n",
+    "quantiles = gpu_stats[gpu_stats.index.dayofweek < 5].quantile([0.1, 0.9, 0.95, 0.99, 1], axis=0).T\n",
+    "\n",
+    "# sort quantiles by key\n",
+    "quantiles = quantiles.sort_index()\n",
+    "quantiles"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Chart the gpu_stats over time\n",
+    "# X-axis: time\n",
+    "# Y-axis: number of jobs in progress\n",
+    "# Each label is a line on the chart\n",
+    "\n",
+    "# just plot the last week\n",
+    "gpu_week_stats = gpu_stats[gpu_stats.index >= gpu_stats.index.max() - pd.Timedelta(weeks=1)]\n",
+    "\n",
+    "plt.figure(figsize=(20, 10))\n",
+    "for label in gpu_week_stats.columns:\n",
+    "    plt.plot(gpu_week_stats.index, gpu_week_stats[label], label=label)\n",
+    "\n",
+    "plt.legend()\n",
+    "plt.title('Number of jobs in progress over time')\n",
+    "plt.xlabel('Time')\n",
+    "plt.ylabel('Number of jobs in progress')\n",
+    "plt.show()\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}