Skip to content

Commit 8c1939b

Browse files
Update extract_submissions.py
1 parent ef2a38d commit 8c1939b

File tree

1 file changed

+115
-55
lines changed

1 file changed

+115
-55
lines changed
Lines changed: 115 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,76 @@
11
import os
2-
import csv
2+
import re
33
import random
4+
import requests
45
from collections import defaultdict
6+
from datetime import datetime
57
from openpyxl import Workbook
68
from openpyxl.styles import Alignment, Font
79
from openpyxl.utils import get_column_letter
810
from openpyxl.worksheet.datavalidation import DataValidation
911

10-
# Load deduplicated submissions
11-
with open("ambassador/ambassador_submissions_deduped.csv", newline='', encoding='utf-8') as f:
12-
reader = csv.DictReader(f)
13-
submissions = list(reader)
14-
15-
# Define reviewers
12+
# Set your GitHub repo details
13+
# Repository whose issues hold the ambassador nominations.
REPO = "pytorch-fdn/ambassador-program"
# Personal access token read from the environment; None when the variable is unset.
GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")
# Send the Authorization header only when a token is configured. Formatting
# None into the header would send the literal string "Bearer None".
HEADERS = {"Authorization": f"Bearer {GITHUB_TOKEN}"} if GITHUB_TOKEN else {}
# NOTE(review): `labels=closed` filters on a *label* named "closed", not on the
# issue state (`state=all` already covers open and closed) -- confirm this is intended.
API_URL = f"https://api.github.com/repos/{REPO}/issues?state=all&labels=closed&per_page=100"
17+
18+
# Output directories
19+
os.makedirs("ambassador/reviewer_sheets_excel", exist_ok=True)
20+
21+
# Helper to extract structured data from the issue body
22+
def extract_submission(issue):
    """Build a flat submission record from one GitHub issue payload.

    The issue body is searched for ``**Label**`` headings; the text between a
    heading and the next line starting with ``**`` (or the end of the body)
    becomes that field's value.

    Args:
        issue: Mapping with ``number`` and ``created_at`` keys and an optional
            ``body`` (a missing or ``None`` body is treated as empty text).

    Returns:
        dict mapping column names to stripped strings; a field whose heading
        is absent, or whose value is blank, maps to ``""``.
    """
    # A null body would make re.search raise TypeError, so normalise it first.
    body = issue.get("body") or ""

    def extract(label):
        """Return the stripped text under ``**label**``, or "" if not found."""
        # Only horizontal whitespace (and an optional "\r") may follow the
        # heading before its line break. Using `\s*` here would swallow the
        # blank line of an empty field and capture the *next* heading and its
        # value instead. The lookahead stops at the next line that starts with
        # "**", even when that line immediately follows the heading.
        pattern = rf"\*\*{re.escape(label)}\*\*[^\S\n]*\n([\s\S]*?)(?=^\*\*|\Z)"
        match = re.search(pattern, body, re.IGNORECASE | re.MULTILINE)
        return match.group(1).strip() if match else ""

    return {
        "Issue #": str(issue["number"]),
        "Nominee Name": extract("Nominee Name"),
        "Nominee Email": extract("Nominee Email"),
        "GitHub Handle": extract("Nominee's GitHub or GitLab Handle"),
        "Organization": extract("Organization / Affiliation"),
        "Location": extract("City, State/Province, Country"),
        "Nominator Name": extract("Your Name"),
        "Nominator Email": extract("Your Email"),
        "Contributions": extract("How has the nominee contributed to PyTorch?"),
        "Ambassador Pitch": extract("How Would the Nominee Contribute as an Ambassador?"),
        "Extra Notes": extract("Any additional details you'd like to share?"),
        "Created At": issue["created_at"],
    }
43+
44+
# Step 1: Fetch and parse issues
45+
print("📥 Fetching GitHub issues...")
all_issues = []
page = 1
while True:
    # A timeout keeps the script from hanging forever on a stalled connection.
    response = requests.get(f"{API_URL}&page={page}", headers=HEADERS, timeout=30)
    data = response.json()
    if not isinstance(data, list):
        # Anything other than a JSON list is treated as an error payload
        # (it is read for a "message" key). Report it rather than ending
        # the loop silently with a partial or empty result.
        message = data.get("message", data) if isinstance(data, dict) else data
        print(f"⚠️ GitHub API returned an error (HTTP {response.status_code}): {message}")
        break
    if not data:
        # An empty page means every issue has been fetched.
        break
    all_issues.extend(data)
    page += 1

# Keep only issues that look like nominations. The body may be null for an
# issue created without a description, so fall back to an empty string
# before the substring test.
submissions_raw = [
    extract_submission(issue)
    for issue in all_issues
    if "Nominee Name" in (issue.get("body") or "")
]
57+
58+
# Step 2: Deduplicate by nominee name, keeping latest
59+
print("🧹 Deduplicating...")
# `deduped` keeps the newest submission per lower-cased nominee name;
# every submission that loses that comparison is collected in `duplicates`.
deduped = {}
duplicates = []
for sub in submissions_raw:
    name_key = sub["Nominee Name"].strip().lower()
    created = datetime.strptime(sub["Created At"], "%Y-%m-%dT%H:%M:%SZ")
    kept = deduped.get(name_key)
    if kept is not None and created <= datetime.strptime(kept["Created At"], "%Y-%m-%dT%H:%M:%SZ"):
        # Not strictly newer than the entry already held: this one is the duplicate.
        duplicates.append(sub)
        continue
    if kept is not None:
        # The entry held so far is superseded by a strictly newer submission.
        duplicates.append(kept)
    deduped[name_key] = sub

submissions = [*deduped.values()]
72+
73+
# Step 3: Reviewer logic
1674
reviewers = [f"Reviewer {i}" for i in range(1, 8)]
1775

1876
# Updated rubric including all categories from the latest file
@@ -46,109 +104,111 @@
46104
("Credibility", "Community References", "References from other known community members?")
47105
]
48106

49-
# Dynamically detect unique rubric categories in order
50107
# Unique rubric categories in first-seen order (dict keys preserve insertion order).
summary_categories = list(dict.fromkeys(category for category, _, _ in rubric))
54111

55-
# Output directory
56-
output_folder = "ambassador/reviewer_sheets_excel"
57-
os.makedirs(output_folder, exist_ok=True)
58-
59-
# Assign reviewers evenly
60112
# Pair every submission with two reviewers drawn at random from the four
# reviewers that currently have the fewest assignments.
assignments = []
reviewer_counts = defaultdict(int)
for sub in submissions:
    least_loaded = sorted(reviewers, key=lambda name: reviewer_counts[name])[:4]
    for picked in random.sample(least_loaded, 2):
        reviewer_counts[picked] += 1
        assignments.append((sub, picked))
67119

68-
# Generate Excel files per reviewer
120+
# Step 4: Generate reviewer sheets
69121
# Build one workbook per reviewer: a "Review Sheet" with one row per
# (assigned submission, rubric question) and a "Score Summary" sheet whose
# formulas count the "Yes" answers per rubric category.
for reviewer in reviewers:
    wb = Workbook()
    ws = wb.active
    ws.title = "Review Sheet"
    summary_ws = wb.create_sheet("Score Summary")

    headers = [
        "Submission ID", "First Name", "Last Name", "Submission Summary",
        "Reviewer's Comment", "Category", "Subcategory", "Question", "Score"
    ]
    ws.append(headers)
    for c in range(1, len(headers) + 1):
        ws.cell(row=1, column=c).font = Font(bold=True)

    # Drop-down validation applied to every cell of the Score column (I).
    dv = DataValidation(type="list", formula1='"Yes,No,N/A"', allow_blank=True)
    ws.add_data_validation(dv)

    row_idx = 2  # next free row on the review sheet (row 1 holds the headers)
    ranges = []  # (sid, fname, lname, first_row, last_row) per submission

    for sub, assigned_reviewer in assignments:
        if assigned_reviewer != reviewer:
            continue
        sid = sub["Issue #"]
        name_parts = sub["Nominee Name"].split()
        # The nominee name can be empty when the field could not be parsed
        # from the issue body; indexing [0] unguarded would raise IndexError.
        fname = name_parts[0] if name_parts else ""
        lname = name_parts[-1] if len(name_parts) > 1 else ""
        summary = f"""
GitHub: {sub.get("GitHub Handle", "")}
Org: {sub.get("Organization", "")}
Location: {sub.get("Location", "")}

Contributions:
{sub.get("Contributions", "")}

Ambassador Pitch:
{sub.get("Ambassador Pitch", "")}

Additional Info:
{sub.get("Extra Notes", "")}
""".strip()

        start = row_idx
        for cat, subcat, question in rubric:
            ws.append([sid, fname, lname, summary, "", cat, subcat, question, ""])
            row_idx += 1
        end = row_idx - 1
        ranges.append((sid, fname, lname, start, end))

        for col in [1, 2, 3, 4, 5]:  # Merge key fields
            ws.merge_cells(start_row=start, end_row=end, start_column=col, end_column=col)
            ws.cell(row=start, column=col).alignment = Alignment(vertical="top", wrap_text=True)
        # `row` rather than `r`, so the loop variable does not shadow the
        # name used for reviewers elsewhere in the script.
        for row in range(start, end + 1):
            dv.add(ws[f"I{row}"])

    # Autofit columns
    for col in ws.columns:
        max_len = max((len(str(c.value)) if c.value else 0) for c in col)
        ws.column_dimensions[get_column_letter(col[0].column)].width = min(max_len + 5, 60)

    # Score Summary
    summary_ws.append(["Submission ID", "First Name", "Last Name"] + summary_categories + ["Final Score"])
    for col in range(1, summary_ws.max_column + 1):
        summary_ws.cell(row=1, column=col).font = Font(bold=True)

    for sid, fname, lname, start, end in ranges:
        # Group this submission's review-sheet rows by rubric category (column F).
        cat_rows = defaultdict(list)
        for row in range(start, end + 1):
            cat = ws.cell(row=row, column=6).value
            cat_rows[cat].append(row)

        formulas = []
        for cat in summary_categories:
            if cat in cat_rows:
                rows = cat_rows[cat]
                # The range runs from the first to the last row of the category,
                # so it assumes a category's rubric rows are contiguous.
                formulas.append(f'=SUMPRODUCT(--(\'Review Sheet\'!I{rows[0]}:I{rows[-1]}="Yes"))')
            else:
                formulas.append("0")
        row_number = summary_ws.max_row + 1
        # Category columns start at D (column 4), after ID / first name / last name.
        final_formula = f"=SUM({','.join([f'{get_column_letter(i+4)}{row_number}' for i in range(len(formulas))])})"
        summary_ws.append([sid, fname, lname] + formulas + [final_formula])

    wb.save(f"ambassador/reviewer_sheets_excel/{reviewer.replace(' ', '_').lower()}_sheet.xlsx")
149204

150-
# Save
151-
filename = os.path.join(output_folder, f"{reviewer.replace(' ', '_').lower()}_sheet.xlsx")
152-
wb.save(filename)
205+
# Step 5: Save duplicates separately
206+
# Write every submission dropped during deduplication to its own workbook.
dup_wb = Workbook()
ws = dup_wb.active
ws.title = "Duplicates Removed"
# Take the column names from the first duplicate, or none when nothing was
# deduplicated (indexing duplicates[0] unguarded would raise IndexError).
dup_columns = list(duplicates[0].keys()) if duplicates else []
if dup_columns:
    ws.append(dup_columns)
for d in duplicates:
    # Look values up by column *name*. Iterating `ws[1]` yields Cell objects,
    # which never match a dict key, so every row came out blank.
    ws.append([d.get(k, "") for k in dup_columns])
dup_wb.save("ambassador/duplicates_removed.xlsx")
153213

154-
print("✅ Reviewer sheets generated with updated rubric and corrected score summary.")
214+
print("✅ All reviewer sheets and duplicates file generated.")

0 commit comments

Comments
 (0)