|
2 | 2 | import csv |
3 | 3 | import re |
4 | 4 | from github import Github |
5 | | -from openpyxl import Workbook |
6 | 5 |
|
7 | | -# Get GitHub token and repository name |
| 6 | +# Load secrets |
8 | 7 | GITHUB_TOKEN = os.getenv("GITHUB_TOKEN") |
9 | 8 | GITHUB_REPOSITORY = os.getenv("GITHUB_REPOSITORY") |
10 | 9 |
|
11 | 10 | # Authenticate with GitHub |
12 | 11 | g = Github(GITHUB_TOKEN) |
13 | 12 | repo = g.get_repo(GITHUB_REPOSITORY) |
14 | 13 |
|
15 | | -print("📥 Fetching open GitHub issues labeled 'ambassador'...") |
16 | | -issues = list(repo.get_issues(state='open', labels=['ambassador'])) |
17 | | -print(f"🔍 Total open issues fetched: {len(issues)}") |
| 14 | +print("🔍 Fetching ambassador issues (open only)...") |
| 15 | +issues = repo.get_issues(state="open", labels=["ambassador"]) |
18 | 16 |
|
19 | | -# Helper to extract a field from the issue body |
20 | | -def extract(label, body): |
21 | | - match = re.search(rf"{re.escape(label)}\s*\n\s*(.+)", body) |
| 17 | +submissions = [] |
| 18 | + |
def extract_value(label: str, body: str) -> str:
    """Return the first non-blank line that follows ``label`` in ``body``.

    The label is matched literally. It is passed through ``re.escape`` so
    that labels containing regex metacharacters (for example the trailing
    ``?`` in "Any additional details you'd like to share?") are found in
    the text instead of being interpreted as pattern syntax.

    Args:
        label: Heading text to look for, exactly as it appears in the body.
        body: Full issue body to search.

    Returns:
        The stripped text of the line after the label, or ``""`` when the
        label is not present or nothing follows it.
    """
    # label, optional trailing whitespace, a newline, any blank lines, then
    # capture lazily up to the end of that single line.
    pattern = rf"{re.escape(label)}\s*\n\s*(.+?)(?:\n|$)"
    match = re.search(pattern, body)
    return match.group(1).strip() if match else ""
23 | 22 |
|
24 | | -# Extract structured data from each issue |
25 | | -submissions = [] |
def extract_checkboxes(body: str) -> str:
    """Return the checked task-list items in ``body`` as a bullet list.

    Only lines that *start* with ``- [x]`` (case-insensitive, optional
    leading indentation) count. Anchoring to the start of the line keeps a
    literal "- [x] " typed inside a free-text answer from being reported as
    a ticked box. Items whose label is blank are skipped so the result
    never contains an empty "- " bullet.

    Args:
        body: Full issue body to scan.

    Returns:
        One ``"- <label>"`` line per checked item, joined with newlines, or
        ``""`` when nothing is checked.
    """
    checked = re.findall(
        r"^[ \t]*- \[x\] (.+)", body, re.IGNORECASE | re.MULTILINE
    )
    # strip() also drops the trailing "\r" that "." captures on CRLF bodies.
    labels = (item.strip() for item in checked)
    return "\n".join(f"- {label}" for label in labels if label)
| 26 | + |
26 | 27 | for issue in issues: |
27 | 28 | body = issue.body or "" |
28 | | - entry = { |
| 29 | + |
| 30 | + nominee_name = extract_value("Nominee Name", body) |
| 31 | + nominee_email = extract_value("Nominee Email", body) |
| 32 | + github_handle = extract_value("Nominee's GitHub or GitLab Handle", body) |
| 33 | + organization = extract_value("Organization / Affiliation", body) |
| 34 | + location = extract_value("City, State/Province, Country", body) |
| 35 | + nominator_name = extract_value("Your Name", body) |
| 36 | + nominator_email = extract_value("Your Email", body) |
| 37 | + ambassador_pitch = extract_value("🏆 How Would the Nominee Contribute as an Ambassador?", body) |
| 38 | + additional_info = extract_value("Any additional details you'd like to share?", body) |
| 39 | + contributions = extract_checkboxes(body) |
| 40 | + |
| 41 | + # Compose the Submission Summary |
| 42 | + summary_parts = [] |
| 43 | + if github_handle: |
| 44 | + summary_parts.append(f"GitHub Handle: {github_handle}") |
| 45 | + if contributions: |
| 46 | + summary_parts.append(f"Contributions:\n{contributions}") |
| 47 | + if ambassador_pitch: |
| 48 | + summary_parts.append(f"Ambassador Pitch:\n{ambassador_pitch}") |
| 49 | + if additional_info: |
| 50 | + summary_parts.append(f"Additional Info:\n{additional_info}") |
| 51 | + |
| 52 | + submission_summary = "\n\n".join(summary_parts) |
| 53 | + |
| 54 | + submissions.append({ |
29 | 55 | "Issue #": issue.number, |
30 | | - "Nominee Name": extract("Nominee Name", body), |
31 | | - "Nominee Email": extract("Nominee Email", body), |
32 | | - "GitHub Handle": extract("Nominee's GitHub or GitLab Handle", body), |
33 | | - "Organization": extract("(Optional) Organization / Affiliation", body), |
34 | | - "Location": extract("City, State/Province, Country", body), |
35 | | - "Your Name": extract("Your Name", body), |
36 | | - "Your Email": extract("Your Email (Optional)", body), |
37 | | - "Submission Summary": "\n\n".join([ |
38 | | - f"Nominee Self/Nominated: {extract('Select one:', body)}", |
39 | | - f"Requirements Acknowledged: {extract('Please confirm that the nominee meets the following requirements:', body)}", |
40 | | - f"Contributions: {extract('How has the nominee contributed to PyTorch?', body)}", |
41 | | - f"Ambassador Pitch: {extract('🏆 How Would the Nominee Contribute as an Ambassador?', body)}", |
42 | | - f"Additional Info: {extract('Any additional details you\'d like to share?', body)}" |
43 | | - ]) |
44 | | - } |
45 | | - submissions.append(entry) |
46 | | - |
47 | | -print("🧹 Deduplicating by email or name...") |
48 | | - |
49 | | -# Deduplication logic: keep latest (by issue #), use email if available |
50 | | -latest_submissions = {} |
| 56 | + "Nominee Name": nominee_name, |
| 57 | + "Nominee Email": nominee_email, |
| 58 | + "Organization": organization, |
| 59 | + "Location": location, |
| 60 | + "Nominator Name": nominator_name, |
| 61 | + "Nominator Email": nominator_email, |
| 62 | + "Submission Summary": submission_summary |
| 63 | + }) |
| 64 | + |
| 65 | +print(f"📄 Total submissions found: {len(submissions)}") |
| 66 | +print("🧹 Deduplicating...") |
| 67 | + |
# Deduplicate by nominee email, falling back to nominee name. Submissions are
# visited newest-first (highest issue number), so the first entry seen for a
# key is the one that is kept.
deduped = {}
for entry in sorted(submissions, key=lambda x: x["Issue #"], reverse=True):
    key = (entry["Nominee Email"] or entry["Nominee Name"]).lower()
    if not key:
        # Neither email nor name could be parsed, so there is nothing to
        # match on. Give the entry a per-issue key so that unrelated
        # unparseable submissions are not all collapsed under "" and then
        # misreported as duplicates of each other.
        key = ("issue", entry["Issue #"])
    deduped.setdefault(key, entry)
55 | 74 |
|
56 | | -deduped = list(latest_submissions.values()) |
57 | | -duplicates = [s for s in submissions if s not in deduped] |
| 75 | +deduped_list = list(deduped.values()) |
| 76 | +duplicates = [s for s in submissions if s not in deduped_list] |
58 | 77 |
|
59 | | -# Ensure output folder |
# Save results

# Column order shared by every CSV. It mirrors the keys of the dicts appended
# to `submissions`, and is spelled out here instead of read from
# `submissions[0]` so that a run which finds no issues writes header-only
# files rather than raising IndexError. If a key is added to the submission
# dicts without updating this list, csv.DictWriter raises ValueError.
FIELDNAMES = [
    "Issue #",
    "Nominee Name",
    "Nominee Email",
    "Organization",
    "Location",
    "Nominator Name",
    "Nominator Email",
    "Submission Summary",
]


def _write_csv(path, rows):
    """Write ``rows`` (dicts keyed by FIELDNAMES) to ``path`` with a header."""
    with open(path, "w", newline='', encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=FIELDNAMES)
        writer.writeheader()
        writer.writerows(rows)


os.makedirs("ambassador", exist_ok=True)

_write_csv("ambassador/submissions_all.csv", submissions)
_write_csv("ambassador/submissions_deduped.csv", deduped_list)

# The duplicates file is only produced when something was actually removed.
if duplicates:
    _write_csv("ambassador/submissions_duplicates.csv", duplicates)
73 | 96 |
|
74 | | -# Write duplicates to Excel if any |
| 97 | +print("✅ Extraction and deduplication complete.") |
| 98 | +print("📁 Files created in ambassador/:") |
| 99 | +print(" - submissions_all.csv") |
| 100 | +print(" - submissions_deduped.csv") |
75 | 101 | if duplicates: |
76 | | - wb = Workbook() |
77 | | - ws = wb.active |
78 | | - ws.title = "Duplicates Removed" |
79 | | - ws.append(list(duplicates[0].keys())) |
80 | | - for d in duplicates: |
81 | | - ws.append([d.get(k, "") for k in ws[1]]) |
82 | | - wb.save("ambassador/submissions_duplicates_removed.xlsx") |
83 | | - print("🗂️ Duplicates written to ambassador/submissions_duplicates_removed.xlsx") |
84 | | -else: |
85 | | - print("✅ No duplicates found.") |
86 | | - |
87 | | -print("🎉 Done: Data extracted and files saved.") |
| 102 | + print(" - submissions_duplicates.csv") |
0 commit comments