diff --git a/pr-metrics/.gitignore b/pr-metrics/.gitignore
new file mode 100644
index 000000000..a8188de82
--- /dev/null
+++ b/pr-metrics/.gitignore
@@ -0,0 +1,4 @@
+__pycache__
+pr-data.p
+*.png
+*.csv
diff --git a/pr-metrics/Readme.md b/pr-metrics/Readme.md
new file mode 100644
index 000000000..febedd976
--- /dev/null
+++ b/pr-metrics/Readme.md
@@ -0,0 +1,65 @@
+These scripts collect some metrics about mbed TLS PRs over time.
+
+Usage
+-----
+
+1. `./get-pr-data.py` - this takes a long time and requires the environment
+   variable `GITHUB_API_TOKEN` to be set to a valid [github API
+   token](https://help.github.com/en/github/authenticating-to-github/creating-a-personal-access-token)
+   (unauthenticated access to the API has a limit on the number of requests
+   that is too low for our number of PRs). It generates `pr-data.p` with
+   pickled data.
+2. `./do.sh` - this works offline from the data in `pr-data.p` and generates
+   a bunch of png and csv files.
+
+For example, the report for the last quarter can be generated with:
+```
+./get-pr-data.py # assuming GITHUB_API_TOKEN is set in the environment
+./do.sh
+```
+Note that the metric "median lifetime" is special in that it can't always be
+computed right after the quarter is over: it sometimes needs more time to pass
+and/or more PRs from that quarter to be closed. In that case, the uncertain
+quarter(s) will be shown with an error bar in the png graph, and in the csv
+file an interval will be reported for the value(s) that can't be determined
+yet.
+
+By default, data extends from the start of 2020 to the end of the previous
+quarter. It is possible to change that range using environment variables, for
+example:
+```
+PR_FIRST_DATE=2016-01-01 PR_LAST_DATE=2022-12-31 ./do.sh
+```
+gives data from 2016 to 2022 inclusive.
+
+Requirements
+------------
+
+These scripts require:
+
+- Python >= 3.6 (required by recent enough matplotlib)
+- matplotlib >= 3.1 (3.0 doesn't work)
+- PyGithub >= 1.43 (any version should work, that was just the oldest tested)
+
+### Ubuntu 20.04 (and probably 18.04)
+
+A simple `apt install python3-github python3-matplotlib` is enough.
+
+### Ubuntu 16.04
+
+On Ubuntu 16.04, only Python 3.5 is available by default, which is too old for
+a recent enough matplotlib, so the following was used to run these scripts on
+16.04:
+
+    sudo add-apt-repository ppa:deadsnakes/ppa
+    sudo apt update
+    sudo apt install python3.6 python3.6-venv
+    python3.6 -m venv 36env
+    source 36env/bin/activate
+    pip install --upgrade pip
+    pip install matplotlib
+    pip install pygithub
+
+See `requirements.txt` for an example of a set of working versions.
+
+Note: if you do this, I strongly recommend uninstalling python3.6,
+python3.6-venv and all their dependencies, then removing the deadsnakes PPA
+before any upgrade to 18.04. Failing to do so will result in
+dependency-related headaches: some packages in 18.04 depend on a specific
+version of python3.6, but the version from deadsnakes is higher, so apt won't
+downgrade it and manual intervention will be required.
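As an aside on the rate-limit remark in the Readme above: before starting a long `get-pr-data.py` run it can be handy to check how much API quota the token has left. A minimal sketch, assuming a PyGithub version (such as the one pinned in `requirements.txt`) where `get_rate_limit().core` is available:

```python
#!/usr/bin/env python3
"""Print the remaining GitHub API quota for the configured token (sketch)."""

import os

from github import Github

# Same token convention as get-pr-data.py below.
g = Github(os.environ["GITHUB_API_TOKEN"])
core = g.get_rate_limit().core
print("{} of {} core API requests remaining".format(core.remaining, core.limit))
```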
diff --git a/pr-metrics/do.sh b/pr-metrics/do.sh
new file mode 100755
index 000000000..5c64a809f
--- /dev/null
+++ b/pr-metrics/do.sh
@@ -0,0 +1,9 @@
+#!/bin/sh
+
+set -eu
+
+for topic in created closed pending lifetime backlog; do
+    echo "PRs $topic..."
+    rm -f prs-${topic}.png prs-${topic}.csv
+    ./pr-${topic}.py > prs-${topic}.csv
+done
diff --git a/pr-metrics/get-pr-data.py b/pr-metrics/get-pr-data.py
new file mode 100755
index 000000000..717ec4b31
--- /dev/null
+++ b/pr-metrics/get-pr-data.py
@@ -0,0 +1,41 @@
+#!/usr/bin/env python3
+# coding: utf-8
+
+"""Get PR data from github and pickle it."""
+
+import pickle
+import os
+
+from github import Github
+
+if "GITHUB_API_TOKEN" in os.environ:
+    token = os.environ["GITHUB_API_TOKEN"]
+else:
+    raise SystemExit("You need to provide a GitHub API token")
+
+g = Github(token)
+r = g.get_repo("ARMMbed/mbedtls")
+
+prs = list()
+for p in r.get_pulls(state="all"):
+    print(p.number)
+    # Accessing p.mergeable forces completion of PR data (by default, only
+    # basic info such as status and dates is available) but makes things
+    # slower (about 10x). Only do that for open PRs; we don't need the extra
+    # info for old PRs (only the dates, which are part of the basic info).
+    if p.state == 'open':
+        dummy = p.mergeable
+    prs.append(p)
+
+# After a branch has been updated, github doesn't immediately go and recompute
+# potential conflicts for all open PRs against this branch; instead it does
+# that when the info is requested, and even then it's done asynchronously: the
+# first request might return no data, but if we come back after we've done all
+# the other PRs, the info should have become available in the meantime.
+for p in prs:
+    if p.state == 'open' and p.mergeable is None:
+        print(p.number, 'update')
+        p.update()
+
+with open("pr-data.p", "wb") as f:
+    pickle.dump(prs, f)
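For a quick interactive look at what `get-pr-data.py` produced, the pickle can be loaded directly. A minimal sketch, run from the same directory as `pr-data.p`:

```python
import pickle

# pr-data.p is a pickled list of PyGithub PullRequest objects.
with open("pr-data.p", "rb") as f:
    prs = pickle.load(f)

# A few quick sanity checks on the data.
print(len(prs), "PRs in total")
print(sum(1 for p in prs if p.state == "open"), "still open")
print("oldest PR created on", min(p.created_at for p in prs).date())
```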
diff --git a/pr-metrics/pending-mergeability.py b/pr-metrics/pending-mergeability.py
new file mode 100755
index 000000000..f6f8e34b1
--- /dev/null
+++ b/pr-metrics/pending-mergeability.py
@@ -0,0 +1,36 @@
+#!/usr/bin/env python3
+# coding: utf-8
+
+"""Produce summary of PRs pending per branch and their mergeability status."""
+
+import pickle
+from datetime import datetime
+from collections import Counter
+
+with open("pr-data.p", "rb") as f:
+    prs = pickle.load(f)
+
+c_open = Counter()
+c_mergeable = Counter()
+c_recent = Counter()
+c_recent2 = Counter()
+
+for p in prs:
+    if p.state != "open":
+        continue
+
+    branch = p.base.ref
+    c_open[branch] += 1
+    if p.mergeable:
+        c_mergeable[branch] += 1
+    days = (datetime.now() - p.updated_at).days
+    if days < 31:
+        c_recent[branch] += 1
+    if days < 8:
+        c_recent2[branch] += 1
+
+
+print("              branch: open, mergeable, <31d, <8d")
+for b in sorted(c_open, key=lambda b: c_open[b], reverse=True):
+    print("{:>20}: {: 10}, {: 10}, {: 10}, {: 10}".format(
+        b, c_open[b], c_mergeable[b], c_recent[b], c_recent2[b]))
diff --git a/pr-metrics/pr-backlog.py b/pr-metrics/pr-backlog.py
new file mode 100755
index 000000000..f3634221f
--- /dev/null
+++ b/pr-metrics/pr-backlog.py
@@ -0,0 +1,72 @@
+#!/usr/bin/env python3
+# coding: utf-8
+
+"""Produce analysis of PR backlog over time."""
+
+from prs import pr_dates, first, last, quarter
+
+from datetime import datetime, timedelta
+from collections import Counter
+from itertools import chain
+
+import matplotlib.pyplot as plt
+
+new_days = 90
+old_days = 365
+
+new = Counter()
+med = Counter()
+old = Counter()
+
+for beg, end, com in pr_dates():
+    if end is None:
+        tomorrow = datetime.now().date() + timedelta(days=1)
+        n_days = (tomorrow - beg).days
+    else:
+        n_days = (end - beg).days
+    for i in range(n_days):
+        q = quarter(beg + timedelta(days=i))
+        q1 = quarter(beg + timedelta(days=i+1))
+        # Only count on each quarter's last day
+        if q == q1:
+            continue
+        if i <= new_days:
+            new[q] += 1
+        elif i <= old_days:
+            med[q] += 1
+        else:
+            old[q] += 1
+
+first_q = quarter(first)
+last_q = quarter(last)
+
+quarters = (q for q in chain(new, med, old) if first_q <= q <= last_q)
+quarters = tuple(sorted(set(quarters)))
+
+new_y = tuple(new[q] for q in quarters)
+med_y = tuple(med[q] for q in quarters)
+old_y = tuple(old[q] for q in quarters)
+sum_y = tuple(old[q] + med[q] for q in quarters)
+
+old_name = "older than {} days".format(old_days)
+med_name = "medium"
+new_name = "recent (less than {} days old)".format(new_days)
+
+width = 0.9
+fig, ax = plt.subplots()
+ax.bar(quarters, old_y, width, label=old_name)
+ax.bar(quarters, med_y, width, label=med_name, bottom=old_y)
+ax.bar(quarters, new_y, width, label=new_name, bottom=sum_y)
+ax.legend(loc="upper left")
+ax.grid(True)
+ax.set_xlabel("quarter")
+ax.set_ylabel("Number of PRs pending")
+ax.tick_params(axis="x", labelrotation=90)
+fig.suptitle("State of the PR backlog at the end of each quarter")
+fig.set_size_inches(12.8, 7.2) # default 100 dpi -> 720p
+fig.savefig("prs-backlog.png")
+
+print("Quarter,recent,medium,old,total")
+for q in quarters:
+    print("{},{},{},{},{}".format(q, new[q], med[q], old[q],
+                                  new[q] + med[q] + old[q]))
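To make the backlog bucketing above concrete, here is a small standalone check of the snapshot rule, using the same 90/365-day thresholds as `pr-backlog.py`; the PR dates and the re-implemented `quarter()` helper are just for illustration:

```python
from datetime import date, timedelta


def quarter(d):
    """Quarter label in the same format as prs.py, e.g. 21q2."""
    return "{}q{}".format(d.year % 100, (d.month + 2) // 3)


# A hypothetical PR opened 2021-02-01 and closed 2021-08-20.
beg, end = date(2021, 2, 1), date(2021, 8, 20)
for i in range((end - beg).days):
    day = beg + timedelta(days=i)
    # Snapshot only on the last day of each quarter, bucketed by age in days.
    if quarter(day) != quarter(day + timedelta(days=1)):
        bucket = "recent" if i <= 90 else "medium" if i <= 365 else "old"
        print(quarter(day), bucket)  # -> 21q1 recent, then 21q2 medium
```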
diff --git a/pr-metrics/pr-closed.py b/pr-metrics/pr-closed.py
new file mode 100755
index 000000000..c12740d79
--- /dev/null
+++ b/pr-metrics/pr-closed.py
@@ -0,0 +1,46 @@
+#!/usr/bin/env python3
+# coding: utf-8
+
+"""Produce graph of PRs closed by time period."""
+
+from prs import pr_dates, quarter, first, last
+
+from collections import Counter
+
+import matplotlib.pyplot as plt
+
+first_q = quarter(first)
+last_q = quarter(last)
+
+cnt_all = Counter()
+cnt_com = Counter()
+
+for beg, end, com in pr_dates():
+    if end is None:
+        continue
+    q = quarter(end)
+    cnt_all[q] += 1
+    if com:
+        cnt_com[q] += 1
+
+quarters = tuple(sorted(q for q in cnt_all if first_q <= q <= last_q))
+
+prs_com = tuple(cnt_com[q] for q in quarters)
+prs_team = tuple(cnt_all[q] - cnt_com[q] for q in quarters)
+
+width = 0.9
+fig, ax = plt.subplots()
+ax.bar(quarters, prs_com, width, label="community")
+ax.bar(quarters, prs_team, width, label="core team", bottom=prs_com)
+ax.legend(loc="upper left")
+ax.grid(True)
+ax.set_xlabel("quarter")
+ax.set_ylabel("Number of PRs closed")
+ax.tick_params(axis="x", labelrotation=90)
+fig.suptitle("Number of PRs closed per quarter")
+fig.set_size_inches(12.8, 7.2) # default 100 dpi -> 720p
+fig.savefig("prs-closed.png")
+
+print("Quarter,community closed,total closed")
+for q in quarters:
+    print("{},{},{}".format(q, cnt_com[q], cnt_all[q]))
diff --git a/pr-metrics/pr-created.py b/pr-metrics/pr-created.py
new file mode 100755
index 000000000..e2a187714
--- /dev/null
+++ b/pr-metrics/pr-created.py
@@ -0,0 +1,44 @@
+#!/usr/bin/env python3
+# coding: utf-8
+
+"""Produce graph of PRs created by time period."""
+
+from prs import pr_dates, quarter, first, last
+
+from collections import Counter
+
+import matplotlib.pyplot as plt
+
+first_q = quarter(first)
+last_q = quarter(last)
+
+cnt_all = Counter()
+cnt_com = Counter()
+
+for beg, end, com in pr_dates():
+    q = quarter(beg)
+    cnt_all[q] += 1
+    if com:
+        cnt_com[q] += 1
+
+quarters = tuple(sorted(q for q in cnt_all if first_q <= q <= last_q))
+
+prs_com = tuple(cnt_com[q] for q in quarters)
+prs_team = tuple(cnt_all[q] - cnt_com[q] for q in quarters)
+
+width = 0.9
+fig, ax = plt.subplots()
+ax.bar(quarters, prs_com, width, label="community")
+ax.bar(quarters, prs_team, width, label="core team", bottom=prs_com)
+ax.legend(loc="upper left")
+ax.grid(True)
+ax.set_xlabel("quarter")
+ax.set_ylabel("Number of PRs created")
+ax.tick_params(axis="x", labelrotation=90)
+fig.suptitle("Number of PRs created per quarter")
+fig.set_size_inches(12.8, 7.2) # default 100 dpi -> 720p
+fig.savefig("prs-created.png")
+
+print("Quarter,community created,total created")
+for q in quarters:
+    print("{},{},{}".format(q, cnt_com[q], cnt_all[q]))
diff --git a/pr-metrics/pr-lifetime.py b/pr-metrics/pr-lifetime.py
new file mode 100755
index 000000000..fa01d44cc
--- /dev/null
+++ b/pr-metrics/pr-lifetime.py
@@ -0,0 +1,83 @@
+#!/usr/bin/env python3
+# coding: utf-8
+
+"""Produce graph of lifetime of PRs over time."""
+
+from prs import pr_dates, quarter, first, last
+
+from collections import defaultdict
+
+import matplotlib.pyplot as plt
+from datetime import datetime
+from statistics import median
+import math
+
+first_q = quarter(first)
+last_q = quarter(last)
+
+lifetimes_all_hi = defaultdict(list)
+lifetimes_all_lo = defaultdict(list)
+lifetimes_com_hi = defaultdict(list)
+lifetimes_com_lo = defaultdict(list)
+
+today = datetime.now().date()
+for beg, end, com in pr_dates():
+    if end is None:
+        lo = (today - beg).days
+        hi = math.inf
+    else:
+        hi = lo = (end - beg).days
+
+    q = quarter(beg)
+    lifetimes_all_hi[q].append(hi)
+    lifetimes_all_lo[q].append(lo)
+    if com:
+        lifetimes_com_hi[q].append(hi)
+        lifetimes_com_lo[q].append(lo)
+
+quarters = tuple(sorted(q for q in lifetimes_all_hi if first_q <= q <= last_q))
+
+med_all_hi = tuple(median(lifetimes_all_hi[q]) for q in quarters)
+med_all_lo = tuple(median(lifetimes_all_lo[q]) for q in quarters)
+med_com_hi = tuple(median(lifetimes_com_hi[q]) for q in quarters)
+med_com_lo = tuple(median(lifetimes_com_lo[q]) for q in quarters)
+
+n = len(quarters)
+med_all = tuple((med_all_hi[i] + med_all_lo[i]) / 2 for i in range(n))
+med_com = tuple((med_com_hi[i] + med_com_lo[i]) / 2 for i in range(n))
+err_all = tuple((med_all_hi[i] - med_all_lo[i]) / 2 for i in range(n))
+err_com = tuple((med_com_hi[i] - med_com_lo[i]) / 2 for i in range(n))
+
+fig, ax = plt.subplots()
+ax.errorbar(quarters, med_all, yerr=err_all, fmt="b-", ecolor="r", label="median overall")
+ax.errorbar(quarters, med_com, yerr=err_com, fmt="g-", ecolor="r", label="median community")
+ax.legend(loc="upper left")
+ax.grid(True)
+ax.set_xlabel("quarter")
+ax.set_ylabel("median lifetime in days of PRs created that quarter")
+ax.tick_params(axis="x", labelrotation=90)
+bot, top = ax.get_ylim()
+ax.set_ylim(0, min(365, top))  # we don't care about values over 1 year
+fig.suptitle("Median lifetime of PRs per quarter (less is better)")
+fig.set_size_inches(12.8, 7.2) # default 100 dpi -> 720p
+fig.savefig("prs-lifetime.png")
+
+
+def interval(lo, hi):
+    """Return a string describing the range [lo, hi] of possible values."""
+    if hi == lo:
+        return str(int(hi))
+    if math.isinf(hi):
+        return "> " + str(int(lo))
+
+    return str(int(lo)) + "-" + str(int(hi))
+
+
+print("Quarter,median overall,median community")
+for i in range(len(quarters)):
+    print(
+        "{},{},{}".format(
+            quarters[i],
+            interval(med_all_lo[i], med_all_hi[i]),
+            interval(med_com_lo[i], med_com_hi[i]),
+        )
+    )
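The interval logic above is what produces the error bars and the `> N` values mentioned in the Readme. A toy example with made-up lifetimes shows why the median of a quarter with open PRs may or may not be known exactly:

```python
from statistics import median
import math

# Five PRs from one quarter: three closed (exact lifetimes), two still open.
# An open PR contributes its age so far as a lower bound on its lifetime and
# infinity as the upper bound.
lo = [3, 12, 40, 95, 200]
hi = [3, 12, 40, math.inf, math.inf]
print(median(lo), median(hi))  # 40 40 -> median known exactly, no error bar

# If instead only two had closed and three were still open, the upper median
# is unknown and only "> 40" can be reported for that quarter.
lo = [3, 12, 40, 95, 200]
hi = [3, 12, math.inf, math.inf, math.inf]
print(median(lo), median(hi))  # 40 inf
```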
diff --git a/pr-metrics/pr-pending.py b/pr-metrics/pr-pending.py
new file mode 100755
index 000000000..50aeac53c
--- /dev/null
+++ b/pr-metrics/pr-pending.py
@@ -0,0 +1,54 @@
+#!/usr/bin/env python3
+# coding: utf-8
+
+"""Produce graph of PRs pending over time."""
+
+from prs import pr_dates, first, last
+
+from datetime import datetime, timedelta
+from collections import Counter
+
+import matplotlib.pyplot as plt
+
+cnt_tot = Counter()
+cnt_com = Counter()
+
+for beg, end, com in pr_dates():
+    if end is None:
+        tomorrow = datetime.now().date() + timedelta(days=1)
+        n_days = (tomorrow - beg).days
+    else:
+        n_days = (end - beg).days
+    dates = Counter(beg + timedelta(days=i) for i in range(n_days))
+    cnt_tot.update(dates)
+    if com:
+        cnt_com.update(dates)
+
+dates = tuple(sorted(d for d in cnt_tot.keys() if first <= d <= last))
+
+
+def avg(cnt, date):
+    """Average number of open PRs over the week ending on that date."""
+    return sum(cnt[date - timedelta(days=i)] for i in range(7)) / 7
+
+
+nb_tot = tuple(avg(cnt_tot, d) for d in dates)
+nb_com = tuple(avg(cnt_com, d) for d in dates)
+nb_team = tuple(tot - com for tot, com in zip(nb_tot, nb_com))
+
+fig, ax = plt.subplots()
+ax.plot(dates, nb_tot, "b-", label="total")
+ax.plot(dates, nb_team, "c-", label="core team")
+ax.plot(dates, nb_com, "r-", label="community")
+ax.legend(loc="upper left")
+ax.grid(True)
+ax.set_xlabel("date")
+ax.set_ylabel("number of open PRs (sliding average over a week)")
+fig.suptitle("Number of PRs pending over time (less is better)")
+fig.set_size_inches(12.8, 7.2) # default 100 dpi -> 720p
+fig.savefig("prs-pending.png")
+
+print("date,pending total,pending community")
+for d in dates:
+    tot, com = cnt_tot[d], cnt_com[d]
+    print("{},{},{}".format(d, tot, com))
diff --git a/pr-metrics/prs.py b/pr-metrics/prs.py
new file mode 100644
index 000000000..a436d7e27
--- /dev/null
+++ b/pr-metrics/prs.py
@@ -0,0 +1,116 @@
+#!/usr/bin/env python3
+# coding: utf-8
+
+"""PR data and misc common functions."""
+
+import pickle
+import datetime
+import os
+
+with open("pr-data.p", "rb") as f:
+    prs = pickle.load(f)
+
+
+# Current and past core contributors, alphabetical order (sort -f).
+#
+# That is, people who are or have been in one of:
+# - https://github.com/orgs/Mbed-TLS/teams/mbed-tls-reviewers/members
+# - https://github.com/orgs/Mbed-TLS/teams/mbed-tls-developers/members
+# The list is maintained manually in order to retain past members.
+_team_logins = (
+    "adeaarm",
+    "aditya-deshpande-arm",
+    "andresag01",
+    "AndrzejKurek",
+    "artokin",
+    "bensze01",
+    "brett-warren-arm",
+    "chris-jones-arm",
+    "d3zd3z",
+    "danh-arm",
+    "daverodgman",
+    "davidhorstmann-arm",
+    "dgreen-arm",
+    "gabor-mezei-arm",
+    "gilles-peskine-arm",
+    "hanno-arm",
+    "hanno-becker",
+    "jackbondpreston-arm",
+    "jarlamsa",
+    "jarvte",
+    "JoeSubbiani",
+    "k-stachowiak",
+    "laumor01",
+    "lpy4105",
+    "lukgni",
+    "mazimkhan",
+    "minosgalanakis",
+    "mpg",
+    "mprse",
+    "mstarzyk-mobica",
+    "Patater",
+    "paul-elliott-arm",
+    "piotr-now",
+    "pjbakker",
+    "RcColes",
+    "ronald-cron-arm",
+    "RonEld",
+    "sbutcher-arm",
+    "shanechko",
+    "silabs-hannes",
+    "silabs-Kusumit",
+    "silabs-Saketh",
+    "superna9999",
+    "tom-cosgrove-arm",
+    "tom-daubney-arm",
+    "tuvshinzayaArm",
+    "valeriosetti",
+    "wernerlewis",
+    "xkqian",
+    "yanesca",
+    "yanrayw",
+    "yuhaoth",
+    "yutotakano",
+    "Zaya-dyno",
+    "zhangsenWang",
+)
+
+
+def is_community(pr):
+    """Return False if the PR is from a team member."""
+    if pr.user.login in _team_logins:
+        return False
+    return True
+
+
+def quarter(date):
+    """Return a string describing this date's quarter, for example 19q3."""
+    q = str(date.year % 100)
+    q += "q"
+    q += str((date.month + 2) // 3)
+    return q
+
+
+def pr_dates():
+    """Iterate over PRs with open/close dates and community status."""
+    for pr in prs:
+        beg = pr.created_at.date()
+        end = pr.closed_at.date() if pr.closed_at else None
+        com = is_community(pr)
+        yield (beg, end, com)
+
+
+# default start date: 2020-01-01 (when we moved to tf.org)
+first = datetime.date(2020, 1, 1)
+# default end date: end of the previous quarter
+last = datetime.datetime.now().date()
+current_q = quarter(last)
+while quarter(last) == current_q:
+    last -= datetime.timedelta(days=1)
+# default start/end dates can be overridden from the environment
+if "PR_LAST_DATE" in os.environ:
+    last_str = os.environ["PR_LAST_DATE"]
+    last = datetime.datetime.strptime(last_str, "%Y-%m-%d").date()
+if "PR_FIRST_DATE" in os.environ:
+    first_str = os.environ["PR_FIRST_DATE"]
+    first = datetime.datetime.strptime(first_str, "%Y-%m-%d").date()
diff --git a/pr-metrics/requirements.txt b/pr-metrics/requirements.txt
new file mode 100644
index 000000000..dd1e61571
--- /dev/null
+++ b/pr-metrics/requirements.txt
@@ -0,0 +1,17 @@
+certifi==2020.6.20
+chardet==3.0.4
+cycler==0.10.0
+Deprecated==1.2.10
+idna==2.10
+kiwisolver==1.2.0
+matplotlib==3.3.2
+numpy==1.19.2
+Pillow==7.2.0
+PyGithub==1.53
+PyJWT==1.7.1
+pyparsing==2.4.7
+python-dateutil==2.8.1
+requests==2.24.0
+six==1.15.0
+urllib3==1.25.10
+wrapt==1.12.1
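Finally, a quick way to sanity-check the helpers defined in `prs.py`; note that importing it unpickles `pr-data.p`, so run this from a directory where `./get-pr-data.py` has already been run:

```python
import datetime

# Importing prs loads pr-data.p from the current directory.
from prs import quarter, first, last, pr_dates

print(quarter(datetime.date(2021, 5, 3)))   # -> 21q2
print(quarter(first), "to", quarter(last))  # default reporting range
print(sum(1 for beg, end, com in pr_dates() if com), "community PRs in the data")
```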