| 
 | 1 | +import os  | 
 | 2 | +import csv  | 
 | 3 | +import re  | 
 | 4 | +from github import Github  | 
 | 5 | +from openpyxl import Workbook  | 
 | 6 | + | 
 | 7 | +# Get GitHub token and repository name  | 
 | 8 | +GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")  | 
 | 9 | +GITHUB_REPOSITORY = os.getenv("GITHUB_REPOSITORY")  | 
 | 10 | + | 
 | 11 | +# Authenticate  | 
 | 12 | +g = Github(GITHUB_TOKEN)  | 
 | 13 | +repo = g.get_repo(GITHUB_REPOSITORY)  | 
 | 14 | + | 
 | 15 | +print("📥 Fetching GitHub issues...")  | 
 | 16 | +issues = list(repo.get_issues(state='all', labels=['ambassador']))  | 
 | 17 | + | 
 | 18 | +print(f"🔍 Total issues fetched: {len(issues)}")  | 
 | 19 | + | 
 | 20 | +# Helper to extract a label's value from issue body  | 
 | 21 | +def extract(label, body):  | 
 | 22 | +    match = re.search(rf"{label}\s*\n\s*(.+)", body)  | 
 | 23 | +    return match.group(1).strip() if match else ""  | 
 | 24 | + | 
 | 25 | +# Extract all relevant data  | 
 | 26 | +submissions = []  | 
 | 27 | +for issue in issues:  | 
 | 28 | +    body = issue.body or ""  | 
 | 29 | +    entry = {  | 
 | 30 | +        "Issue #": issue.number,  | 
 | 31 | +        "Nominee Name": extract("Nominee Name", body),  | 
 | 32 | +        "Nominee Email": extract("Nominee Email", body),  | 
 | 33 | +        "Organization": extract("Organization / Affiliation", body),  | 
 | 34 | +        "Location": extract("City, State/Province, Country", body),  | 
 | 35 | +        "Contributions": extract("Relevant Contributions and Links", body),  | 
 | 36 | +        "Ambassador Pitch": extract("Why do you want to be a PyTorch Ambassador?", body),  | 
 | 37 | +        "Extra Notes": extract("Additional Notes or Comments", body),  | 
 | 38 | +        "Nominate Others": extract("I would like to nominate contributors", body),  | 
 | 39 | +        "Additional Info": extract("Any other information", body)  | 
 | 40 | +    }  | 
 | 41 | +    submissions.append(entry)  | 
 | 42 | + | 
 | 43 | +print("🧹 Deduplicating...")  | 
 | 44 | + | 
 | 45 | +# Deduplication logic: use email if present, fallback to name  | 
 | 46 | +latest_submissions = {}  | 
 | 47 | +for entry in sorted(submissions, key=lambda x: x["Issue #"], reverse=True):  | 
 | 48 | +    key = entry["Nominee Email"].lower() if entry["Nominee Email"] else entry["Nominee Name"].lower()  | 
 | 49 | +    if key not in latest_submissions:  | 
 | 50 | +        latest_submissions[key] = entry  | 
 | 51 | + | 
 | 52 | +deduped = list(latest_submissions.values())  | 
 | 53 | +duplicates = [s for s in submissions if s not in deduped]  | 
 | 54 | + | 
 | 55 | +# Ensure output folder  | 
 | 56 | +os.makedirs("ambassador", exist_ok=True)  | 
 | 57 | + | 
 | 58 | +# Save full submission CSV  | 
 | 59 | +with open("ambassador/ambassador_submissions_full.csv", "w", newline='', encoding="utf-8") as f:  | 
 | 60 | +    writer = csv.DictWriter(f, fieldnames=submissions[0].keys())  | 
 | 61 | +    writer.writeheader()  | 
 | 62 | +    writer.writerows(submissions)  | 
 | 63 | + | 
 | 64 | +# Save deduplicated CSV  | 
 | 65 | +with open("ambassador/ambassador_submissions_deduped.csv", "w", newline='', encoding="utf-8") as f:  | 
 | 66 | +    writer = csv.DictWriter(f, fieldnames=deduped[0].keys())  | 
 | 67 | +    writer.writeheader()  | 
 | 68 | +    writer.writerows(deduped)  | 
 | 69 | + | 
 | 70 | +# Save duplicates to Excel  | 
 | 71 | +if duplicates:  | 
 | 72 | +    wb = Workbook()  | 
 | 73 | +    ws = wb.active  | 
 | 74 | +    ws.title = "Duplicates Removed"  | 
 | 75 | +    ws.append(list(duplicates[0].keys()))  | 
 | 76 | +    for d in duplicates:  | 
 | 77 | +        ws.append([d.get(k, "") for k in ws[1]])  | 
 | 78 | +    wb.save("ambassador/duplicates_removed.xlsx")  | 
 | 79 | +    print(f"🗂️ Duplicates written to ambassador/duplicates_removed.xlsx")  | 
 | 80 | + | 
 | 81 | +print("✅ Step 1 complete: Extraction + Deduplication done.")  | 
0 commit comments