Skip to content

Commit 1a7bc0f

Browse files
Update summarize_applications.py
1 parent 510b1fc commit 1a7bc0f

File tree

1 file changed

+42
-59
lines changed

1 file changed

+42
-59
lines changed

.github/scripts/summarize_applications.py

Lines changed: 42 additions & 59 deletions
Original file line number | Diff line number | Diff line change
@@ -4,97 +4,80 @@
44
from github import Github
55
from openpyxl import Workbook
66

7-
# Load GitHub access credentials
7+
# Get GitHub token and repository name
88
GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")
GITHUB_REPOSITORY = os.getenv("GITHUB_REPOSITORY")

# Fail fast with an actionable message: without a repository name the
# lookup below cannot succeed and would otherwise fail inside the client.
if not GITHUB_REPOSITORY:
    raise SystemExit("GITHUB_REPOSITORY is not set (expected 'owner/repo').")

# Authenticate
g = Github(GITHUB_TOKEN)
repo = g.get_repo(GITHUB_REPOSITORY)

print("📥 Fetching open GitHub issues with 'ambassador' label...")
# Materialise the result as a list so it can be counted here and iterated
# again when the submissions are built.
issues = list(repo.get_issues(state='open', labels=['ambassador']))
print(f"✅ Total submissions found: {len(issues)}")
1718

18-
submissions = []
19-
20-
# Helper to extract plain-text responses
19+
# Helper to extract text fields
2120
def extract(label, body):
    """Return the text that follows the heading *label* in *body*.

    The answer starts on the first non-empty line after the label and runs
    until a newline that is immediately followed by a non-whitespace
    character (or the end of the text). Surrounding whitespace is stripped.

    Args:
        label: Literal heading text to look for. It is matched verbatim, so
            characters such as ``?`` or ``.`` carry no regex meaning.
        body: Full issue body to search.

    Returns:
        The stripped answer text, or ``""`` when the label is not found.
    """
    # Escape the label: several headings end in "?", which would otherwise be
    # read as a regex quantifier and stop the heading from ever matching.
    pattern = rf"{re.escape(label)}\s*\n+(.+?)(\n\S|\Z)"
    match = re.search(pattern, body, re.DOTALL)
    return match.group(1).strip() if match else ""
2423

25-
# Helper to extract checkbox options
24+
# Helper to extract all checkbox lines
2625
def extract_checkboxes(body):
    """Collect the text of every ticked markdown checkbox in *body*.

    A ticked box is a line fragment of the form ``- [x] text`` (the ``x`` is
    matched case-insensitively). Unticked boxes are ignored.

    Args:
        body: Full issue body to search.

    Returns:
        The ticked option texts joined with ``"; "``, or ``""`` when none
        are ticked.
    """
    matches = re.findall(r"- \[x\] (.+)", body, flags=re.IGNORECASE)
    # "." stops at "\n" but not at "\r", so bodies with CRLF line endings
    # would leave a stray carriage return on every option; strip each one.
    return "; ".join(option.strip() for option in matches)
3128

32-
# Process each issue
29+
# Build submissions list: one row per issue, with the long-form answers
# folded into a single "Submission Summary" text cell.
submissions = []
for issue in issues:
    body = issue.body or ""

    # Extract the long-form answers before formatting. Calling extract()
    # inside the f-string would require an escaped quote within a
    # replacement field, which is a syntax error.
    ambassador_plan = extract("🏆 How Would the Nominee Contribute as an Ambassador?", body)
    additional_info = extract("Any additional details you'd like to share?", body)
    contributions = extract_checkboxes(body)

    entry = {
        "Issue #": issue.number,
        "Nominee Name": extract("Nominee Name", body),
        "Nominee Email": extract("Nominee Email", body),
        "GitHub Handle": extract("Nominee's GitHub or GitLab Handle", body),
        "Submission Summary": (
            f"🏆 Ambassador Contribution Plan:\n{ambassador_plan}\n\n"
            f"🔗 Additional Information:\n{additional_info}\n\n"
            f"Contribution Highlights:\n{contributions}"
        ),
    }
    submissions.append(entry)
46+
47+
# Deduplication logic: prefer latest submission by email or name.
# Walking from the highest issue number down means the first entry seen for
# a nominee is the most recent one, and that is the one that is kept.
latest_submissions = {}
for entry in sorted(submissions, key=lambda x: x["Issue #"], reverse=True):
    identity = (entry["Nominee Email"] or entry["Nominee Name"]).lower()
    # With neither email nor name there is nothing to match on; key such
    # entries by issue number so they are not collapsed into one nominee.
    key = ("nominee", identity) if identity else ("issue", entry["Issue #"])
    latest_submissions.setdefault(key, entry)

deduped = list(latest_submissions.values())
# Compare by object identity: every kept entry is one of the original dicts,
# so this avoids a dict-by-dict equality scan for each submission.
kept_ids = {id(entry) for entry in deduped}
duplicates = [s for s in submissions if id(s) not in kept_ids]
7456

75-
# Ensure output directory
57+
# Ensure output folder
os.makedirs("ambassador", exist_ok=True)

# Resolve the column order once. When no issues were found there is no first
# row to read the keys from, so fall back to the known headers and still
# write valid (header-only) CSV files instead of raising IndexError.
fieldnames = list(submissions[0]) if submissions else [
    "Issue #",
    "Nominee Name",
    "Nominee Email",
    "GitHub Handle",
    "Submission Summary",
]

# Write full submission CSV
with open("ambassador/submissions_all_raw.csv", "w", newline='', encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(submissions)

# Write deduplicated CSV
with open("ambassador/submissions_deduplicated.csv", "w", newline='', encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(deduped)

# Write duplicates to Excel (only when something was actually removed)
if duplicates:
    wb = Workbook()
    ws = wb.active
    ws.title = "Duplicates Removed"
    ws.append(fieldnames)
    for row in duplicates:
        ws.append([row.get(k, "") for k in fieldnames])
    wb.save("ambassador/submissions_duplicates_removed.xlsx")
    print("🗂️ Duplicates written to ambassador/submissions_duplicates_removed.xlsx")

print("✅ Extraction and deduplication complete.")

0 commit comments

Comments
 (0)