Skip to content

Commit c922275

Browse files
authored
Compute GPU runner usage stats (#6095)
This code is meant for manual use. It computes p10, p90, etc. statistics on how many instances of each NVIDIA runner type we use. (I know, leaving it as a notebook means ugly diffs, but for ad-hoc data queries this is far more functional.)
1 parent 634a4c6 commit c922275

File tree

1 file changed

+243
-0
lines changed

1 file changed

+243
-0
lines changed
Lines changed: 243 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,243 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"metadata": {},
6+
"source": [
7+
"Code to infer the fleet usage statistics for our GPU machines"
8+
]
9+
},
10+
{
11+
"cell_type": "code",
12+
"execution_count": null,
13+
"metadata": {},
14+
"outputs": [],
15+
"source": [
16+
"import clickhouse_connect\n",
17+
"import pandas as pd\n",
18+
"import os\n",
19+
"import re\n",
20+
"import logging\n",
21+
"import matplotlib.pyplot as plt\n",
22+
"\n",
23+
"# env loader\n",
24+
"import dotenv\n",
25+
"dotenv.load_dotenv()\n",
26+
"\n",
27+
"logging.basicConfig(level=logging.INFO)\n",
28+
"logger = logging.getLogger()\n",
29+
"logger.setLevel(\"INFO\")"
30+
]
31+
},
32+
{
33+
"cell_type": "code",
34+
"execution_count": null,
35+
"metadata": {},
36+
"outputs": [],
37+
"source": [
38+
"# set these variables in a local .env file:\n",
39+
"\n",
40+
"CLICKHOUSE_HOST = os.environ['CLICKHOUSE_HOST']\n",
41+
"CLICKHOUSE_USER = os.environ['CLICKHOUSE_USER']\n",
42+
"CLICKHOUSE_PASSWORD = os.environ['CLICKHOUSE_PASSWORD']\n"
43+
]
44+
},
45+
{
46+
"cell_type": "code",
47+
"execution_count": null,
48+
"metadata": {},
49+
"outputs": [],
50+
"source": [
51+
"client = clickhouse_connect.get_client(\n",
52+
" host=CLICKHOUSE_HOST,\n",
53+
" user=CLICKHOUSE_USER,\n",
54+
" password=CLICKHOUSE_PASSWORD,\n",
55+
" secure=True\n",
56+
")"
57+
]
58+
},
59+
{
60+
"cell_type": "code",
61+
"execution_count": null,
62+
"metadata": {},
63+
"outputs": [],
64+
"source": [
65+
"def clean_runner_type(runner_type):\n",
66+
" runner_type = re.sub(r'am2\\.', '', runner_type)\n",
67+
" runner_type = re.sub(r'amz2\\.', '', runner_type)\n",
68+
" runner_type = re.sub(r'amz2023\\.', '', runner_type)\n",
69+
" runner_type = re.sub(r'c\\.', '', runner_type)\n",
70+
" runner_type = re.sub(r'.canary$', '', runner_type)\n",
71+
" runner_type = re.sub(r'lf\\.', '', runner_type)\n",
72+
"\n",
73+
" return runner_type\n",
74+
"\n",
75+
"\n",
76+
"def get_nvidia_jobs_run(client, weeks_ago: int = 2):\n",
77+
" query = \"\"\"\n",
78+
" SELECT\n",
79+
" started_at,\n",
80+
" completed_at,\n",
81+
" age('minute', started_at, completed_at) AS duration_mins,\n",
82+
" arrayFirst(x -> x != 'self-hosted', labels) AS label,\n",
83+
" status,\n",
84+
" conclusion,\n",
85+
" name,\n",
86+
" url\n",
87+
" FROM\n",
88+
" workflow_job\n",
89+
" WHERE\n",
90+
" started_at >= subtractWeeks(now(), 2)\n",
91+
" AND length(arrayFilter(x -> x != 'self-hosted', labels)) > 0\n",
92+
" AND arrayFirst(x -> x != 'self-hosted', labels) LIKE '%gpu%'\n",
93+
" AND arrayFirst(x -> x != 'self-hosted', labels) LIKE '%nvidia%'\n",
94+
" AND arrayFirst(x -> x != 'self-hosted', labels) LIKE '%.%'\n",
95+
" AND status = 'completed'\n",
96+
" \"\"\"\n",
97+
"\n",
98+
" data = client.query(query).result_set\n",
99+
"\n",
100+
" df = pd.DataFrame(\n",
101+
" data,\n",
102+
" columns=[\n",
103+
" 'started_at',\n",
104+
" 'completed_at',\n",
105+
" 'duration_mins',\n",
106+
" 'label',\n",
107+
" 'status',\n",
108+
" 'conclusion',\n",
109+
" 'name',\n",
110+
" 'url']\n",
111+
" )\n",
112+
"\n",
113+
" # clean the data\n",
114+
" df['started_at'] = pd.to_datetime(df['started_at'])\n",
115+
" df['completed_at'] = pd.to_datetime(df['completed_at'])\n",
116+
" df['duration_mins'] = df['duration_mins'].astype(int)\n",
117+
" df['label'] = df['label'].astype(str)\n",
118+
" df['status'] = df['status'].astype(str)\n",
119+
" df['conclusion'] = df['conclusion'].astype(str)\n",
120+
" df['name'] = df['name'].astype(str)\n",
121+
"\n",
122+
" df['label'] = df['label'].apply(clean_runner_type)\n",
123+
"\n",
124+
" return df\n",
125+
"\n",
126+
"\n",
127+
"def get_runner_count_stats(job_run_df):\n",
128+
" # Sample from the earliest job start to the latest job completion\n",
129+
" start_time = job_run_df['started_at'].min()\n",
130+
" end_time = job_run_df['completed_at'].max()\n",
131+
" interval = pd.Timedelta(minutes=1)\n",
132+
" periods = pd.date_range(start=start_time, end=end_time, freq=interval)\n",
133+
"\n",
134+
" # Initialize a DataFrame to store period stats\n",
135+
" period_stats = pd.DataFrame(index=periods)\n",
136+
"\n",
137+
"\n",
138+
" # For each unique label, at each time period we compute how many jobs are running in parallel\n",
139+
" for label in job_run_df['label'].unique():\n",
140+
" # Filter jobs by label\n",
141+
" label_df = job_run_df[job_run_df['label'] == label]\n",
142+
"\n",
143+
" counts = []\n",
144+
" for period in periods:\n",
145+
" # Count jobs in progress at this sample instant (started_at <= period < completed_at)\n",
146+
" count = label_df[(label_df['started_at'] <= period) & (label_df['completed_at'] > period)].shape[0]\n",
147+
" counts.append(count)\n",
148+
"\n",
149+
" period_stats[label] = counts\n",
150+
"\n",
151+
" return period_stats"
152+
]
153+
},
154+
{
155+
"cell_type": "code",
156+
"execution_count": null,
157+
"metadata": {},
158+
"outputs": [],
159+
"source": [
160+
"num_weeks = 3\n",
161+
"gpu_jobs_df = get_nvidia_jobs_run(client, num_weeks)"
162+
]
163+
},
164+
{
165+
"cell_type": "code",
166+
"execution_count": null,
167+
"metadata": {},
168+
"outputs": [],
169+
"source": [
170+
"gpu_stats = get_runner_count_stats(gpu_jobs_df)"
171+
]
172+
},
173+
{
174+
"cell_type": "code",
175+
"execution_count": null,
176+
"metadata": {},
177+
"outputs": [],
178+
"source": [
179+
"# Compute the quantiles only for the weekdays (Monday = 0, Friday = 4)\n",
180+
"# For each label, get the p10, p90, p95, p99, and p100 number of jobs in progress\n",
181+
"quantiles = gpu_stats[gpu_stats.index.dayofweek < 5].quantile([0.1, 0.9, 0.95, 0.99, 1], axis=0).T\n",
182+
"\n",
183+
"# sort quantiles by key\n",
184+
"quantiles = quantiles.sort_index()\n",
185+
"quantiles"
186+
]
187+
},
188+
{
189+
"cell_type": "code",
190+
"execution_count": null,
191+
"metadata": {},
192+
"outputs": [],
193+
"source": [
194+
"# Chart the gpu_stats over time\n",
195+
"# X-axis: time\n",
196+
"# Y-axis: number of jobs in progress\n",
197+
"# Each label is a line on the chart\n",
198+
"\n",
199+
"# just plot the last week\n",
200+
"gpu_week_stats = gpu_stats[gpu_stats.index >= gpu_stats.index.max() - pd.Timedelta(weeks=1)]\n",
201+
"\n",
202+
"plt.figure(figsize=(20, 10))\n",
203+
"for label in gpu_week_stats.columns:\n",
204+
" plt.plot(gpu_week_stats.index, gpu_week_stats[label], label=label)\n",
205+
"\n",
206+
"plt.legend()\n",
207+
"plt.title('Number of jobs in progress over time')\n",
208+
"plt.xlabel('Time')\n",
209+
"plt.ylabel('Number of jobs in progress')\n",
210+
"plt.show()\n",
211+
"\n"
212+
]
213+
},
214+
{
215+
"cell_type": "code",
216+
"execution_count": null,
217+
"metadata": {},
218+
"outputs": [],
219+
"source": []
220+
}
221+
],
222+
"metadata": {
223+
"kernelspec": {
224+
"display_name": ".venv",
225+
"language": "python",
226+
"name": "python3"
227+
},
228+
"language_info": {
229+
"codemirror_mode": {
230+
"name": "ipython",
231+
"version": 3
232+
},
233+
"file_extension": ".py",
234+
"mimetype": "text/x-python",
235+
"name": "python",
236+
"nbconvert_exporter": "python",
237+
"pygments_lexer": "ipython3",
238+
"version": "3.10.9"
239+
}
240+
},
241+
"nbformat": 4,
242+
"nbformat_minor": 2
243+
}

0 commit comments

Comments
 (0)