"""Download submissions, reviews, speakers and answers from the pretalx API.

For each resource two JSON files are written under data/<event>/:
a timestamped snapshot and a "<resource>_latest.json" copy.
"""

import json
from datetime import datetime
from pathlib import Path
from pprint import pprint as pp

import requests
from config import conf


class Config:
    # API token comes from config.toml (see config.toml.example).
    token = conf["pretalx-token"]
    event = "europython-2023"


headers = {
    "Accept": "application/json, text/javascript",
    "Authorization": f"Token {Config.token}",
}

base_url = f"https://pretalx.com/api/events/{Config.event}/"

resources = [
    "submissions",
    "reviews",
    "speakers",
    "answers",
]

# fix: create the output directory up front — previously the script crashed
# with FileNotFoundError if data/<event>/ did not exist yet.
out_dir = Path("data") / Config.event
out_dir.mkdir(parents=True, exist_ok=True)

for resource in resources:
    print("h1: ", resource)
    url = base_url + f"{resource}"

    results = []
    data = {"next": url}
    page = 0
    # pretalx paginates list endpoints; follow "next" links until exhausted.
    while url := data["next"]:
        page += 1
        print(f"Page {page}")
        response = requests.get(url, headers=headers)
        # fix: fail loudly on auth/HTTP errors instead of writing garbage.
        response.raise_for_status()
        data = response.json()
        results += data["results"]

    timestamp = datetime.now().strftime("%Y%m%d_%H%M")
    fnames = [
        out_dir / f"{resource}_{timestamp}.json",
        out_dir / f"{resource}_latest.json",
    ]
    for fname in fnames:
        with open(fname, "w") as fd:
            json.dump(results, fd)
"""
Parse, merge and combine data from speakers, answers, submissions to generate
the base dataframe that we can use later to merge reviews and community voting
results.
"""

import json
from collections import Counter
from datetime import datetime as dt

import numpy as np
import pandas as pd

# Wide, terminal-friendly display settings for interactive runs.
pd.options.display.max_rows = 50
pd.set_option("display.max_columns", 500)
pd.set_option("display.width", 1000)

# Inputs produced by 01_download_data_from_pretalx.py.
sub = pd.read_json("./data/europython-2023/submissions_latest.json")
rev = pd.read_json("./data/europython-2023/reviews_latest.json")
ans = pd.read_json("./data/europython-2023/answers_latest.json")
spk = pd.read_json("./data/europython-2023/speakers_latest.json")
# vot = pd.read_csv("./data/europython-2023/votes-latest.csv")


# First simplify the submissions.
# We can drop the speakers from the original table because we will be later
# joining with the speakers table anyway.
# We need a separate join with speakers, because we do separate joins with
# answers - some answers are per submission and some are per speaker.

sub["type"] = sub["submission_type"].map(lambda x: x.get("en", "").strip())
# fix: the original `isinstance(x, dict) and x.get("en", "")` produced the
# boolean False for non-dict tracks; now the column is consistently a string.
sub["track"] = sub["track"].map(lambda x: x.get("en", "") if isinstance(x, dict) else "")
del sub["submission_type"]
# Skip anything that is not in the "submitted" state (drops withdrawn talks).
sub = sub[sub["state"] == "submitted"]


# Clean up answers as well: pull the question id and the English question
# text out of the nested "question" dicts.
ans["q_id"] = ans["question"].map(lambda x: x.get("id", ""))
ans["q"] = ans["question"].map(lambda x: x.get("question", {}).get("en", ""))

# This is a helper for a temporary grouping of questions and answers that we
# use for both submission and speaker answers.
# The details don't matter that much because we later convert those answers
# to separate columns.


def _qa_dict(row):
    """Zip the imploded q_id/q/answer lists of *row* into [{q_id, q, a}, ...]."""
    out = []
    for i, q, a in zip(row["q_id"], row["q"], row["answer"]):
        out.append({"q_id": i, "q": q, "a": a})
    return out


def _get_answer(answers, _id):
    """Extract the answer(s) to the question with pretalx ID *_id*.

    Multiple matching answers are joined with ", "; no match yields "".
    """
    # fix: only keep string answers — a left join with no answers leaves NaN
    # placeholders inside the Q dicts, and a null answer would previously make
    # ", ".join() raise a TypeError.
    q = [
        x.get("a")
        for x in answers
        if x.get("q_id") == _id and isinstance(x.get("a"), str)
    ]
    return ", ".join(q)


# Big Picture
# ===========
# 1. We start with prepping the speakers table, and we merge it with answers.
# 2. Then we merge answers and clean up submissions.
# 3. Then we merge the new cleaned-up speakers table with the new cleaned-up
#    submissions table.
#
# This gives us a basic table with proposals that we can merge with voting and
# reviews data.
spk = spk.rename(
    columns={"code": "speaker_code", "name": "speaker_name", "biography": "bio"}
)

# Attach every answer row to the speaker who gave it; a left join keeps
# speakers who answered nothing.
speakers1 = pd.merge(
    spk,
    ans[["q_id", "q", "person", "submission", "answer"]],
    left_on="speaker_code",
    right_on="person",
    how="left",
)


def implode(df, keep: list[str], agg: list[str]):
    """Collapse *df* to one row per unique `keep` combination.

    The columns listed in `agg` are gathered into per-row lists, zipped into
    a single "Q" column of {q_id, q, a} dicts (via _qa_dict), and dropped.
    """
    assert isinstance(keep, list)
    assert isinstance(agg, list)

    collapsed = df.groupby(keep, group_keys=True, dropna=False).aggregate(
        {name: (lambda series: series.tolist()) for name in agg}
    )
    collapsed.reset_index(inplace=True)
    collapsed = collapsed.loc[:, keep + agg]

    collapsed["Q"] = collapsed.apply(_qa_dict, axis=1)
    return collapsed.drop(["q_id", "q", "answer"], axis=1)


# Remember each speaker's submissions before imploding; they are re-attached
# afterwards so that exploding does not create extra duplicates.
temp = (
    speakers1[["speaker_code", "submissions"]]
    .drop_duplicates("speaker_code")
    .set_index("speaker_code")
)

# Implode the version without submissions.
speakers1 = speakers1.drop("submissions", axis=1)


speakers2 = implode(
    speakers1,
    keep=[
        "speaker_code",
        "speaker_name",
        "email",
        "bio",
    ],
    agg=[
        "q_id",
        "q",
        "answer",
    ],
)

print(speakers2)


# pretalx question IDs for the per-speaker questions.
# NOTE(review): "homepage" -> 239 falls outside the 23xx range of its
# neighbours — looks like a possible typo; confirm against pretalx.
qa = {
    "company": 2346,
    "country": 2347,
    "job": 2348,
    "homepage": 239,
    "github": 2352,
    "linkedin": 2353,
    "underrep": 2354,
    "underrep_details": 2355,
    "underrep_other": 2356,
    "finaid": 2357,
    "under18": 2358,
}

# One flat column per question, extracted from the "Q" dicts.
for col, key in qa.items():
    speakers2[col] = speakers2["Q"].map(lambda answers, _id=key: _get_answer(answers, _id))

speakers2 = speakers2.drop("Q", axis=1).set_index("speaker_code")
speakers2["subs"] = temp["submissions"]
# One row per (speaker, submission) pair.
speakers3 = speakers2.explode("subs")
# ===========================
# 2. CLEAN THE QA on SUBS
# ===========================

# Join every answer to the submission it belongs to; a left join keeps
# submissions that received no answers at all.
with_answers_1 = pd.merge(
    sub,
    ans[["q_id", "q", "person", "submission", "answer"]],
    how="left",
    left_on="code",
    right_on="submission",
)

with_answers_2 = implode(
    with_answers_1,
    keep=[
        "code",
        "title",
        "type",
        "track",
        "duration",
        "state",
        "abstract",
        "description",
    ],
    agg=[
        "q_id",
        "q",
        "answer",
    ],
)


# pretalx question IDs for the per-submission questions.
qa = {
    "length": 2360,
    "onsite": 2362,
    "exp": 2363,
    "tweet": 2365,
    "for_reviewers": 2366,
    "material": 2367,
}

# One flat column per question, extracted from the "Q" dicts.
for col, key in qa.items():
    with_answers_2[col] = with_answers_2["Q"].map(lambda x: _get_answer(x, key))

with_answers_2 = with_answers_2.drop("Q", axis=1)
# with_answers_2 = with_answers_2.set_index("code")


# ==================================
# 3. MERGE SUBMISSIONS WITH SPEAKERS
# ==================================

df = pd.merge(
    with_answers_2,
    speakers3,
    how="left",
    left_on="code",
    right_on="subs",
)

# NOW ON TO THE VOTING AND REVIEWS DATA.
#
# Reviews is easy - we already have the data, we just need to load and merge
# it again. For voting we will need the export from the voting app.


# =====================
# 4. MERGE REVIEWS
# =====================

# Backup the columns, you will need them later for the aggregation.
cols = df.columns

# Bulk review metrics per submission.
gr = rev.groupby("submission").agg(
    Rsum=pd.NamedAgg(column="score", aggfunc="sum"),
    Ravg=pd.NamedAgg(column="score", aggfunc="mean"),
    Rmed=pd.NamedAgg(column="score", aggfunc="median"),
    Rcnt=pd.NamedAgg(column="score", aggfunc="count"),
)

df = df.set_index("subs")

withrevs = df.join(gr, how="left")
# ==================================================================
# 5. Grouped scores are done, now we need to show individual scores.
#    For every reviewer we create a column, keyed by their initials.
# ==================================================================


reviewers_to_initials = {
    x: "".join([y[0] for y in x.split()]).upper() for x in sorted(rev["user"].unique())
}
cc = Counter(reviewers_to_initials.values())

for k, v in reviewers_to_initials.items():
    # If there are duplicates just add numbers (e.g. CM, CM2).
    if cc[v] > 1:
        reviewers_to_initials[k] = f"{v}{cc[v]}"
        cc[v] -= 1


rev["initials"] = rev["user"].map(reviewers_to_initials)

# One ("score", <initials>) column per reviewer.
reviews_p = rev.pivot(columns="initials", values=["score"], index="submission").fillna("")
reviews_p.columns = reviews_p.columns.to_flat_index()

main = withrevs.join(reviews_p, how="left")

# =====================================================
# 6. YAY. ONE LAST FINAL PIECE. COMMUNITY VOTING
# =====================================================

# fix: `vot` was only ever assigned in a commented-out read_csv at the top of
# the file, so this section crashed with a NameError. Load the export here.
vot = pd.read_csv("./data/europython-2023/votes-latest.csv")
vot = vot.set_index("code")
main = main.join(vot, how="left")


print(main.columns)
# Final column selection and order for the spreadsheet.
IDX = [
    "type",
    "track",
    "title",
    "speaker_name",
    # "duration",
    # "state",
    # "abstract",
    # "description",
    "length",
    "onsite",
    "exp",
    "tweet",
    "for_reviewers",
    "material",
    "email",
    "bio",
    "company",
    "country",
    "job",
    # "homepage",
    # "github",
    # "linkedin",
    "underrep",
    "underrep_details",
    "underrep_other",
    "finaid",
    "under18",
    # Review bulk metrics
    "Rsum",
    "Ravg",
    "Rmed",
    "Rcnt",
    # Community voting
    "score",
    "must",
    "want",
    "maybe",
    "votes",
    "comments",
    # Individual review scores
    ("score", "A"),
    ("score", "AC"),
    ("score", "AH"),
    ("score", "AM"),
    ("score", "AWT"),
    ("score", "CB"),
    ("score", "CL"),
    ("score", "CM"),
    ("score", "CM2"),
    ("score", "DL"),
    ("score", "DS"),
    ("score", "DV"),
    ("score", "EA"),
    ("score", "JW"),
    ("score", "LC"),
    ("score", "NR"),
    ("score", "NT"),
    ("score", "PJ"),
    ("score", "RD"),
    ("score", "RD2"),
    ("score", "RK"),
    ("score", "RP"),
    ("score", "SG"),
    ("score", "SS"),
    ("score", "SW"),
    ("score", "SZ"),
    ("score", "TM"),
    ("score", "VGF"),
    ("score", "VM"),
]

main = main[IDX]

# Flatten the ("score", <initials>) tuple columns down to just the initials.
skip_score = {x: x[1] for x in main.columns if x[0] == "score"}
main = main.rename(columns=skip_score)

# Clean up remaining few things.
main = main.rename(columns={
    "length": "_L",
})
# This is a question so it can't be cleaned up early at the beginning, only
# after merging with answers and imploding.
# NOTE(review): any answer without the literal substring "remote" becomes
# "YES", including blank answers — confirm that is intended.
main["onsite"] = main["onsite"].map(lambda x: "REMOTE" if "remote" in x else "YES")

# Convert the talk ID to a clickable link that goes to the reviews.
# First we drop the current index (which is a pretalx ID); the new column is
# called "index" - we can keep the name, it's good enough.
# Then we add hyperlinks to this new column before exporting.
main = main.reset_index()


def link(x):
    """Return a spreadsheet HYPERLINK formula pointing at the proposal's reviews."""
    event_name = "europython-2023"
    url = f"https://pretalx.com/orga/event/{event_name}/submissions/{x}/reviews/"
    return f'=HYPERLINK("{url}", "{x}")'


main["index"] = main["index"].map(link)


# And finally export to an excel file.
tstamp = dt.now().strftime("%Y%m%d-%H%M")
fname = f"./main-filtered-out-missing-entries-{tstamp}.xlsx"
main.to_excel(fname, index=False)

print(f"FINISHED AND SAVED at {fname}")
"""Load config.toml and expose it as the module-level `conf` dict."""

import tomllib

# Parsed once at import time; keys (e.g. "pretalx-token") come straight
# from config.toml — see config.toml.example for the expected shape.
with open("config.toml", "rb") as handle:
    conf = tomllib.load(handle)