Skip to content

Commit 3667df2

Browse files
committed
add statistics notebook
1 parent 1512df9 commit 3667df2

File tree

1 file changed

+317
-0
lines changed

1 file changed

+317
-0
lines changed

notebooks/statistics.ipynb

Lines changed: 317 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,317 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"metadata": {},
6+
"source": [
7+
"# Table of Contents\n",
8+
"1. [Setup](#Setup)\n",
9+
"2. [Inspecting download counts](#Inspecting-download-counts)\n",
10+
"3. [Contributed resources](#Contributed-resources)\n",
11+
"4. [Average processing time of proposed contributions](#Average-processing-time-of-proposed-contributions)\n",
12+
"5. [Cleanup](#Cleanup)\n"
13+
]
14+
},
15+
{
16+
"cell_type": "markdown",
17+
"metadata": {},
18+
"source": [
19+
"# Setup "
20+
]
21+
},
22+
{
23+
"cell_type": "code",
24+
"execution_count": null,
25+
"metadata": {},
26+
"outputs": [],
27+
"source": [
28+
"from tempfile import mkdtemp\n",
29+
"from pathlib import Path\n",
30+
"from shutil import rmtree\n",
31+
"from subprocess import run\n",
32+
"\n",
33+
"import os\n",
34+
"import warnings\n",
35+
"\n",
36+
"def cleanup(folder):\n",
37+
" print(f\"Cleaning up {folder}\")\n",
38+
" try:\n",
39+
" rmtree(folder)\n",
40+
" except Exception as e:\n",
41+
" warnings.warn(str(e))\n",
42+
"\n",
43+
"if \"temp_dir\" in locals():\n",
44+
" cleanup(temp_dir)\n",
45+
"\n",
46+
"temp_dir = mkdtemp()\n",
47+
"\n",
48+
"os.chdir(temp_dir)\n",
49+
"run([\"git\", \"clone\", \"https://github.com/bioimage-io/collection-bioimage-io.git\", \"--branch\", \"gh-pages\", \"--single-branch\"], check=True)  # arg list: a plain string command fails on POSIX without shell=True\n",
50+
"os.chdir(\"collection-bioimage-io\")\n",
51+
"print(f\"working in {Path().absolute()}\")"
52+
]
53+
},
54+
{
55+
"cell_type": "markdown",
56+
"metadata": {},
57+
"source": [
58+
"# Inspecting download counts"
59+
]
60+
},
61+
{
62+
"cell_type": "code",
63+
"execution_count": null,
64+
"metadata": {},
65+
"outputs": [],
66+
"source": [
67+
"out = run([\"git\", \"log\", \"--pretty=format:%H,%aI\", \"download_counts.json\"], check=True, capture_output=True)  # arg list (no shell): string commands fail on POSIX; quotes around the format are unneeded without a shell\n",
68+
"log = out.stdout.decode().split()\n",
69+
"print(len(log), log[0])"
70+
]
71+
},
72+
{
73+
"cell_type": "code",
74+
"execution_count": null,
75+
"metadata": {},
76+
"outputs": [],
77+
"source": [
78+
"from datetime import date, datetime\n",
79+
"from typing import NewType, Dict\n",
80+
"\n",
81+
"Hash = NewType(\"Hash\", str)\n",
82+
"all_commits: Dict[date, Dict[datetime, Hash]] = {}\n",
83+
"hash: Hash\n",
84+
"for log_entry in log:\n",
85+
" hash, iso_datetime = log_entry.split(\",\")\n",
86+
" dt = datetime.fromisoformat(iso_datetime)\n",
87+
" d = dt.date()\n",
88+
" day = all_commits.setdefault(d, {})\n",
89+
" assert dt not in day\n",
90+
" day[dt] = hash\n",
91+
"\n",
92+
"commits: Dict[date, Hash] = {}\n",
93+
"for d, day in all_commits.items():\n",
94+
" commits[d] = max(day.items())[1]\n",
95+
"\n",
96+
"len(commits)"
97+
]
98+
},
99+
{
100+
"cell_type": "code",
101+
"execution_count": null,
102+
"metadata": {},
103+
"outputs": [],
104+
"source": [
105+
"from subprocess import CalledProcessError \n",
106+
"from tqdm import tqdm\n",
107+
"\n",
108+
"import json\n",
109+
"\n",
110+
"all_downloads: Dict[date, int] = {}\n",
111+
"try:\n",
112+
" for d, hash in tqdm(commits.items(), total=len(commits)):\n",
113+
"        out = run([\"git\", \"checkout\", \"--force\", hash], check=True, capture_output=True)  # arg list avoids needing shell=True\n",
114+
" with Path(\"download_counts.json\").open() as f:\n",
115+
" counts = json.load(f)\n",
116+
" \n",
117+
" all_downloads[d] = sum(counts.values())\n",
118+
"except CalledProcessError:\n",
119+
" print(out.stdout.decode())\n",
120+
" raise\n",
121+
"finally:\n",
122+
"    run([\"git\", \"checkout\", \"--force\", \"gh-pages\"], check=True)  # arg list avoids needing shell=True"
123+
]
124+
},
125+
{
126+
"cell_type": "code",
127+
"execution_count": null,
128+
"metadata": {},
129+
"outputs": [],
130+
"source": [
131+
"import pandas as pd\n",
132+
"\n",
133+
"series = pd.Series(all_downloads, name=\"total downloads\")\n",
134+
"series.head()"
135+
]
136+
},
137+
{
138+
"cell_type": "code",
139+
"execution_count": null,
140+
"metadata": {},
141+
"outputs": [],
142+
"source": [
143+
"import matplotlib.pyplot as plt\n",
144+
"import seaborn as sns\n",
145+
"\n",
146+
"sns.set(style=\"darkgrid\", context=\"talk\")\n",
147+
"plt.style.use(\"dark_background\")\n",
148+
"\n",
149+
"fig, axs = plt.subplots(figsize=(16, 4))\n",
150+
"\n",
151+
"(series / 1e3).plot(kind='line',ax=axs, title=series.name)\n",
152+
"plt.xlabel(\"date\")\n",
153+
"plt.ylabel(\"10³\")\n",
154+
"plt.show()"
155+
]
156+
},
157+
{
158+
"cell_type": "markdown",
159+
"metadata": {},
160+
"source": [
161+
"# Contributed resources"
162+
]
163+
},
164+
{
165+
"cell_type": "code",
166+
"execution_count": null,
167+
"metadata": {},
168+
"outputs": [],
169+
"source": [
170+
"with Path(\"collection.json\").open() as f:\n",
171+
" collection = json.load(f)\n",
172+
"\n",
173+
"col = collection[\"collection\"]\n",
174+
"print(\"total:\", len(col))\n",
175+
"per_type = {}\n",
176+
"for e in col:\n",
177+
" t = e[\"type\"]\n",
178+
" per_type[t] = per_type.get(t, 0) + 1\n",
179+
"\n",
180+
"print(\"per type:\", per_type)"
181+
]
182+
},
183+
{
184+
"cell_type": "markdown",
185+
"metadata": {},
186+
"source": [
187+
"# Average processing time of proposed contributions\n",
188+
"\n",
189+
"Here we analyze the time it takes to close a generated PR that proposes to update the bioimage.io collection based on a new Zenodo record (version).\n",
190+
"\n",
191+
"These PRs are created by the [@bioimageiobot](https://github.com/bioimageiobot) and tagged with the 'auto-update' label.\n",
192+
"They have to be closed/merged by a (human) bioimage.io maintainer."
193+
]
194+
},
195+
{
196+
"cell_type": "code",
197+
"execution_count": null,
198+
"metadata": {},
199+
"outputs": [],
200+
"source": [
201+
"from pprint import pprint\n",
202+
"\n",
203+
"import os\n",
204+
"import requests\n",
205+
"\n",
206+
"url = \"https://api.github.com/graphql\"\n",
207+
"gh_token = os.getenv(\"GITHUB_TOKEN\")\n",
208+
"assert gh_token is not None, \"Missing env var 'GITHUB_TOKEN'\"\n",
209+
"query = \"\"\"\n",
210+
"{\n",
211+
" search(query: \"repo:bioimage-io/collection-bioimage-io is:pr author:bioimageiobot is:closed sort:created-desc\", type: ISSUE, first: 100) {\n",
212+
" edges {\n",
213+
" node {\n",
214+
" ... on PullRequest {\n",
215+
" createdAt\n",
216+
" closedAt\n",
217+
" }\n",
218+
" }\n",
219+
" }\n",
220+
" pageInfo {\n",
221+
" hasNextPage\n",
222+
" }\n",
223+
" }\n",
224+
"}\n",
225+
"\"\"\"\n",
226+
"r = requests.post(url, auth=(\"TOKEN\", gh_token), json={'query': query}).json()\n",
227+
"assert \"data\" in r, r\n",
228+
"data = r[\"data\"]\n",
229+
"edges = data[\"search\"][\"edges\"][::-1] # reverse descending order to ascending\n",
230+
"\n",
231+
"start = edges[0]['node']['createdAt']\n",
232+
"end = edges[-1]['node']['closedAt']\n",
233+
"print(f\"{len(edges)} PRs from {start} to {end}\")"
234+
]
235+
},
236+
{
237+
"cell_type": "code",
238+
"execution_count": null,
239+
"metadata": {},
240+
"outputs": [],
241+
"source": [
242+
"from dateutil.parser import isoparse \n",
243+
"from numpy import busday_count, mean\n",
244+
"\n",
245+
"from holidays import country_holidays\n",
246+
"\n",
247+
"local_holidays = country_holidays(\"Germany\", subdiv=\"BW\")[start:end]\n",
248+
"\n",
249+
"_durations = {}\n",
250+
"for edge in edges:\n",
251+
" created = isoparse(edge[\"node\"][\"createdAt\"])\n",
252+
" closed = isoparse(edge[\"node\"][\"closedAt\"])\n",
253+
" delta = busday_count(created.date(), closed.date(), holidays=local_holidays)\n",
254+
" _durations[created] = delta\n",
255+
"\n",
256+
"dur_col = \"duration [work days in BW]\"\n",
257+
"durations = pd.DataFrame(_durations.items(), columns=(\"created\", dur_col))\n",
258+
"durations[dur_col].mean()"
259+
]
260+
},
261+
{
262+
"cell_type": "code",
263+
"execution_count": null,
264+
"metadata": {},
265+
"outputs": [],
266+
"source": [
267+
"fig, axes = plt.subplots(figsize=(16, 4))\n",
268+
"durations.plot(kind=\"scatter\", x=\"created\", y=dur_col, ax=axes)\n",
269+
"plt.show()"
270+
]
271+
},
272+
{
273+
"cell_type": "markdown",
274+
"metadata": {},
275+
"source": [
276+
"# Cleanup"
277+
]
278+
},
279+
{
280+
"cell_type": "code",
281+
"execution_count": null,
282+
"metadata": {},
283+
"outputs": [],
284+
"source": [
285+
"cleanup(temp_dir)"
286+
]
287+
},
288+
{
289+
"cell_type": "code",
290+
"execution_count": null,
291+
"metadata": {},
292+
"outputs": [],
293+
"source": []
294+
}
295+
],
296+
"metadata": {
297+
"kernelspec": {
298+
"display_name": "Python 3 (ipykernel)",
299+
"language": "python",
300+
"name": "python3"
301+
},
302+
"language_info": {
303+
"codemirror_mode": {
304+
"name": "ipython",
305+
"version": 3
306+
},
307+
"file_extension": ".py",
308+
"mimetype": "text/x-python",
309+
"name": "python",
310+
"nbconvert_exporter": "python",
311+
"pygments_lexer": "ipython3",
312+
"version": "3.8.17"
313+
}
314+
},
315+
"nbformat": 4,
316+
"nbformat_minor": 4
317+
}

0 commit comments

Comments
 (0)