Skip to content

Commit 295c8a5

Browse files
Update extract_submissions.py
1 parent 0d1e1b0 commit 295c8a5

File tree

1 file changed

+132
-127
lines changed

1 file changed

+132
-127
lines changed
Lines changed: 132 additions & 127 deletions
Original file line numberDiff line numberDiff line change
@@ -1,79 +1,101 @@
11
import os
2-
import re
2+
import csv
33
import random
4-
import requests
5-
from collections import defaultdict
64
from datetime import datetime
5+
from collections import defaultdict
6+
from github import Github
77
from openpyxl import Workbook
88
from openpyxl.styles import Alignment, Font
99
from openpyxl.utils import get_column_letter
1010
from openpyxl.worksheet.datavalidation import DataValidation
1111

12-
# Set your GitHub repo details
13-
REPO = "pytorch-fdn/ambassador-program"
14-
GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")
15-
HEADERS = {"Authorization": f"Bearer {GITHUB_TOKEN}"}
16-
API_URL = f"https://api.github.com/repos/{REPO}/issues?state=all&labels=closed&per_page=100"
17-
18-
# Output directories
19-
os.makedirs("ambassador/reviewer_sheets_excel", exist_ok=True)
20-
21-
# Helper to extract structured data from the issue body
22-
def extract_submission(issue):
23-
body = issue["body"]
24-
def extract(label): # Flexible line extractor
25-
pattern = rf"\*\*{re.escape(label)}\*\*\s*\n([\s\S]*?)(?:\n\*\*|$)"
26-
match = re.search(pattern, body, re.IGNORECASE)
27-
return match.group(1).strip() if match else ""
28-
29-
return {
30-
"Issue #": str(issue["number"]),
31-
"Nominee Name": extract("Nominee Name"),
32-
"Nominee Email": extract("Nominee Email"),
33-
"GitHub Handle": extract("Nominee's GitHub or GitLab Handle"),
34-
"Organization": extract("Organization / Affiliation"),
35-
"Location": extract("City, State/Province, Country"),
36-
"Nominator Name": extract("Your Name"),
37-
"Nominator Email": extract("Your Email"),
38-
"Contributions": extract("How has the nominee contributed to PyTorch?"),
39-
"Ambassador Pitch": extract("How Would the Nominee Contribute as an Ambassador?"),
40-
"Extra Notes": extract("Any additional details you'd like to share?"),
41-
"Created At": issue["created_at"]
12+
# Load GitHub issues
# Fetch every issue carrying the "closed" label, keep the ones that look like
# nominations, and parse each markdown body into one flat dict per submission.
print("📥 Fetching GitHub issues...")
GITHUB_TOKEN = os.environ["GITHUB_TOKEN"]
GITHUB_REPO = os.environ["GITHUB_REPOSITORY"]
REPO = Github(GITHUB_TOKEN).get_repo(GITHUB_REPO)

# (heading marker, submission field) pairs, checked in this order; the first
# marker found in a line wins.
# NOTE(review): a heading is recognised only when the marker text appears
# verbatim, including both "**" delimiters. A heading written as
# "**Nominee's GitHub or GitLab Handle**" or "**Your Email**" would NOT match
# the markers below -- confirm them against the live issue template.
FIELD_MARKERS = (
    ("**Nominee Name**", "Nominee Name"),
    ("**Nominee Email**", "Nominee Email"),
    ("**GitHub or GitLab Handle**", "Nominee GitHub"),
    ("**Organization / Affiliation**", "Organization"),
    ("**City, State/Province, Country**", "Location"),
    ("**Your Name**", "Nominator Name"),
    ("**Your Email (Optional)**", "Nominator Email"),
    ("**How has the nominee contributed**", "Nominee Contributions"),
    ("**How Would the Nominee Contribute as an Ambassador?**", "Ambassador Pitch"),
    ("**Any additional details you'd like to share?**", "Additional Info"),
)

issues = REPO.get_issues(state="all", labels=["closed"])
submissions_raw = []
for issue in issues:
    # Skip issues without a body and anything that is not a nomination.
    if not issue.body or "[Nomination]" not in issue.title:
        continue
    submission = {
        "Issue #": issue.number,
        "Nominee Name": "",
        "Nominee GitHub": "",
        "Nominee Email": "",
        "Organization": "",
        "Location": "",
        "Nominator Name": "",
        "Nominator Email": "",
        "Nominee Contributions": "",
        "Ambassador Pitch": "",
        "Additional Info": "",
        "Created At": issue.created_at.strftime("%Y-%m-%d %H:%M:%S"),
    }

    # Extract fields: a heading line switches the current field; every
    # following non-blank line belongs to that field until the next heading.
    collected = defaultdict(list)
    current_key = ""
    for line in issue.body.splitlines():
        heading = next(
            (field for marker, field in FIELD_MARKERS if marker in line), None
        )
        if heading is not None:
            current_key = heading
        elif line.strip() and current_key:
            collected[current_key].append(line.strip())

    # Join with newlines instead of appending "\n" after every line, so that
    # values do not carry a trailing newline into the CSV cells and the
    # reviewer summary.
    for field, parts in collected.items():
        submission[field] = "\n".join(parts)

    submissions_raw.append(submission)
66+
67+
# Deduplicate by GitHub handle (latest entry kept)
print("🧹 Deduplicating...")
seen = {}
duplicates = []
# "Created At" is a "%Y-%m-%d %H:%M:%S" string, so sorting the strings sorts
# chronologically. Walking oldest-first and letting each newer submission
# displace the older one for the same key leaves the LATEST entry in `seen`
# and moves every superseded entry to `duplicates`.
for s in sorted(submissions_raw, key=lambda x: x["Created At"]):
    handle = s["Nominee GitHub"].strip().lower()
    # A submission whose handle is empty must not be merged with every other
    # handle-less submission, so it is keyed by its own issue number instead.
    key = ("handle", handle) if handle else ("issue", s["Issue #"])
    if key in seen:
        duplicates.append(seen[key])
    seen[key] = s

submissions = list(seen.values())
7279

73-
# Step 3: Reviewer logic
74-
reviewers = [f"Reviewer {i}" for i in range(1, 8)]
80+
# Save deduplicated CSV
os.makedirs("ambassador", exist_ok=True)
csv_path = "ambassador/ambassador_submissions_deduped.csv"
# Column order comes from the first submission. When no issue matched there
# is no first submission, so fall back to the known field list and write a
# header-only file instead of raising IndexError on submissions[0].
csv_fields = list(submissions[0]) if submissions else [
    "Issue #",
    "Nominee Name",
    "Nominee GitHub",
    "Nominee Email",
    "Organization",
    "Location",
    "Nominator Name",
    "Nominator Email",
    "Nominee Contributions",
    "Ambassador Pitch",
    "Additional Info",
    "Created At",
]
with open(csv_path, "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=csv_fields)
    writer.writeheader()
    writer.writerows(submissions)
7587

76-
# Updated rubric including all categories from the latest file
88+
# Save duplicates separately
# Written only when at least one submission was superseded during
# deduplication; one header row followed by one row per duplicate.
if duplicates:
    dup_wb = Workbook()
    ws = dup_wb.active
    ws.title = "Duplicates Removed"
    dup_headers = list(duplicates[0])
    ws.append(dup_headers)
    for d in duplicates:
        # Look values up by header NAME. Iterating ws[1] yields Cell objects,
        # which are never keys of the submission dict, so every value would
        # fall back to "" and the rows would be written blank.
        ws.append([d.get(k, "") for k in dup_headers])
    dup_wb.save("ambassador/duplicates_removed.xlsx")
97+
98+
# Rubric
7799
rubric = [
78100
("Technical Expertise", "Proficiency with the PyTorch Ecosystem", "Demonstrated knowledge and practical experience with PyTorch, including model building, traininga and deployment?"),
79101
("Technical Expertise", "Proficiency with the PyTorch Ecosystem", "Familiarity with foundation-hosted projects, vLLM, DeepSpeed?"),
@@ -96,28 +118,24 @@ def extract(label): # Flexible line extractor
96118
("Alignment and Values", "Alignment with PyTorch Foundation Values", "Commitment to open source principles, community-first development, and inclusive collaboration?"),
97119
("Alignment and Values", "Alignment with PyTorch Foundation Values", "Advocacy for responsible AI development and ethical machine learning practices?"),
98120
("Motivation and Vision", "Vision", "Clear articulation of why they want to be an Ambassador and what they hope to accomplish?"),
99-
("Motivation and Vision", "Vision", "Proposed goals or initiatives that align with the mission of the PyTorch Foundation?"),
100-
("Additional Bonus Criteria", "Cross-Community Collaboration", "Contributions or bridges to other relevant ecosystems (e.g., HuggingFace?)"),
101-
("Additional Bonus Criteria", "Cross-Community Collaboration", "Integration work across tools or libraries within the AI/ML infrastructure landscape?"),
102-
("Additional Bonus Criteria", "Geographic and Demographic Diversity", "Representation from underrepresented regions or groups to foster inclusivity and global outreach?"),
103-
("Additional Bonus Criteria", "Innovation and Pioneering Work", "Early adoption or novel application of PyTorch or its ecosystem tools in industry, research, or startups?"),
104-
("Credibility", "Community References", "References from other known community members?")
121+
("Motivation and Vision", "Vision", "Proposed goals or initiatives that align with the mission of the PyTorch Foundation?")
105122
]
106123

107-
summary_categories = []
108-
for cat, _, _ in rubric:
109-
if cat not in summary_categories:
110-
summary_categories.append(cat)
124+
# Rubric categories in first-appearance order, without repeats; these become
# the per-category columns of the score summary.
summary_categories = list({cat: None for cat, _, _ in rubric})
reviewers = ["Reviewer " + str(n) for n in range(1, 8)]
output_folder = "ambassador/reviewer_sheets_excel"
os.makedirs(output_folder, exist_ok=True)

# Assign reviewers evenly
# Each submission gets two distinct reviewers drawn at random from the four
# reviewers that currently hold the fewest assignments.
assignments = []
reviewer_counts = defaultdict(int)
for submission in submissions:
    least_loaded = sorted(reviewers, key=lambda name: reviewer_counts[name])[:4]
    assigned = random.sample(least_loaded, 2)
    for reviewer in assigned:
        reviewer_counts[reviewer] += 1
        assignments.append((submission, reviewer))
119137

120-
# Step 4: Generate reviewer sheets
138+
# Generate reviewer workbooks
121139
for reviewer in reviewers:
122140
wb = Workbook()
123141
ws = wb.active
@@ -129,91 +147,78 @@ def extract(label): # Flexible line extractor
129147
"Reviewer's Comment", "Category", "Subcategory", "Question", "Score"
130148
]
131149
ws.append(headers)
132-
for c in range(1, len(headers)+1):
133-
ws.cell(row=1, column=c).font = Font(bold=True)
150+
for col in range(1, len(headers)+1):
151+
ws.cell(row=1, column=col).font = Font(bold=True)
134152

135153
dv = DataValidation(type="list", formula1='"Yes,No,N/A"', allow_blank=True)
136154
ws.add_data_validation(dv)
137155

138156
row_idx = 2
139-
ranges = []
157+
candidate_ranges = []
140158

141-
for sub, r in assignments:
142-
if r != reviewer:
159+
for submission, assigned_reviewer in assignments:
160+
if assigned_reviewer != reviewer:
143161
continue
144-
sid = sub["Issue #"]
145-
name_parts = sub["Nominee Name"].split()
146-
fname = name_parts[0]
147-
lname = name_parts[-1] if len(name_parts) > 1 else ""
148-
summary = f"""
149-
GitHub: {sub.get("GitHub Handle", "")}
150-
Org: {sub.get("Organization", "")}
151-
Location: {sub.get("Location", "")}
152162

153-
Contributions:
154-
{sub.get("Contributions", "")}
163+
sid = submission["Issue #"]
164+
name = submission["Nominee Name"].split()
165+
fname = name[0]
166+
lname = name[-1] if len(name) > 1 else ""
155167

156-
Ambassador Pitch:
157-
{sub.get("Ambassador Pitch", "")}
168+
# Submission Summary includes all fields except first 3
169+
summary = f"""GitHub: {submission.get("Nominee GitHub", "")}
170+
Email: {submission.get("Nominee Email", "")}
171+
Organization: {submission.get("Organization", "")}
172+
Location: {submission.get("Location", "")}
173+
Nominator: {submission.get("Nominator Name", "")}
174+
Nominator Email: {submission.get("Nominator Email", "")}
158175
159-
Additional Info:
160-
{sub.get("Extra Notes", "")}
161-
""".strip()
176+
Contributions:\n{submission.get("Nominee Contributions", "")}
177+
Ambassador Pitch:\n{submission.get("Ambassador Pitch", "")}
178+
Additional Info:\n{submission.get("Additional Info", "")}"""
162179

163180
start = row_idx
164181
for cat, subcat, question in rubric:
165182
ws.append([sid, fname, lname, summary, "", cat, subcat, question, ""])
166183
row_idx += 1
167184
end = row_idx - 1
168-
ranges.append((sid, fname, lname, start, end))
185+
candidate_ranges.append((sid, fname, lname, start, end))
169186

170-
for col in [1, 2, 3, 4, 5]: # Merge key fields
187+
for col in [1, 2, 3, 4, 5]: # Merge ID, First, Last, Summary, Reviewer Comment
171188
ws.merge_cells(start_row=start, end_row=end, start_column=col, end_column=col)
172-
ws.cell(row=start, column=col).alignment = Alignment(vertical="top", wrap_text=True)
173-
for r in range(start, end+1):
189+
cell = ws.cell(row=start, column=col)
190+
cell.alignment = Alignment(vertical="top", wrap_text=True)
191+
192+
for r in range(start, end + 1):
174193
dv.add(ws[f"I{r}"])
175194

176-
# Autofit columns
177195
for col in ws.columns:
178-
max_len = max((len(str(c.value)) if c.value else 0) for c in col)
179-
ws.column_dimensions[get_column_letter(col[0].column)].width = min(max_len + 5, 60)
196+
max_len = max((len(str(cell.value)) if cell.value else 0) for cell in col)
197+
ws.column_dimensions[get_column_letter(col[0].column)].width = min(max_len + 5, 50)
180198

181-
# Score Summary
182199
summary_ws.append(["Submission ID", "First Name", "Last Name"] + summary_categories + ["Final Score"])
183200
for col in range(1, summary_ws.max_column + 1):
184201
summary_ws.cell(row=1, column=col).font = Font(bold=True)
185202

186-
for sid, fname, lname, start, end in ranges:
187-
cat_rows = defaultdict(list)
203+
for sid, fname, lname, start, end in candidate_ranges:
204+
category_rows = defaultdict(list)
188205
for r in range(start, end + 1):
189206
cat = ws.cell(row=r, column=6).value
190-
cat_rows[cat].append(r)
207+
category_rows[cat].append(r)
191208

192209
formulas = []
193210
for cat in summary_categories:
194-
if cat in cat_rows:
195-
rows = cat_rows[cat]
211+
if cat in category_rows:
212+
rows = category_rows[cat]
196213
formulas.append(f'=SUMPRODUCT(--(\'Review Sheet\'!I{rows[0]}:I{rows[-1]}="Yes"))')
197214
else:
198215
formulas.append("0")
199-
row_number = summary_ws.max_row + 1
200-
final_formula = f"=SUM({','.join([f'{get_column_letter(i+4)}{row_number}' for i in range(len(formulas))])})"
201-
summary_ws.append([sid, fname, lname] + formulas + [final_formula])
202-
203-
wb.save(f"ambassador/reviewer_sheets_excel/{reviewer.replace(' ', '_').lower()}_sheet.xlsx")
204-
205-
# Step 5: Save duplicates separately
206-
dup_wb = Workbook()
207-
ws = dup_wb.active
208-
ws.title = "Duplicates Removed"
209216

210-
if duplicates:
211-
ws.append(list(duplicates[0].keys()))
212-
for d in duplicates:
213-
ws.append([d.get(k, "") for k in ws[1]])
214-
else:
215-
ws.append(["No duplicates found"])
217+
row_number = summary_ws.max_row + 1
218+
total_formula = f"=SUM({','.join([f'{get_column_letter(i+4)}{row_number}' for i in range(len(formulas))])})"
219+
summary_ws.append([sid, fname, lname] + formulas + [total_formula])
216220

217-
dup_wb.save("ambassador/duplicates_removed.xlsx")
221+
filename = os.path.join(output_folder, f"{reviewer.replace(' ', '_').lower()}_sheet.xlsx")
222+
wb.save(filename)
218223

219-
print("✅ All reviewer sheets and duplicates file generated.")
224+
print("✅ All reviewer sheets generated successfully.")

0 commit comments

Comments
 (0)