|
2 | 2 | import csv |
3 | 3 | import re |
4 | 4 | from github import Github |
| 5 | +from openpyxl import Workbook |
5 | 6 |
|
6 | | -# Step 0: Setup environment |
| 7 | +# Get GitHub token and repository name |
7 | 8 | GITHUB_TOKEN = os.getenv("GITHUB_TOKEN") |
8 | 9 | GITHUB_REPOSITORY = os.getenv("GITHUB_REPOSITORY") |
9 | 10 |
|
10 | | -# Step 1: Authenticate GitHub |
| 11 | +# Authenticate with GitHub |
11 | 12 | g = Github(GITHUB_TOKEN) |
12 | 13 | repo = g.get_repo(GITHUB_REPOSITORY) |
13 | 14 |
|
14 | | -print("📥 Fetching GitHub issues...") |
15 | | -issues = list(repo.get_issues(state='all', labels=['ambassador'])) |
16 | | -print(f"🔍 Total issues fetched: {len(issues)}") |
| 15 | +print("📥 Fetching open GitHub issues labeled 'ambassador'...") |
| 16 | +issues = list(repo.get_issues(state='open', labels=['ambassador'])) |
| 17 | +print(f"🔍 Total open issues fetched: {len(issues)}") |
17 | 18 |
|
18 | | -# Helper: Extract value from GitHub issue template body |
| 19 | +# Helper to extract a field from the issue body |
19 | 20 | def extract(label, body): |
20 | | - match = re.search(rf"{label}\s*\n\s*(.+)", body) |
| 21 | + match = re.search(rf"{re.escape(label)}\s*\n\s*(.+)", body) |
21 | 22 | return match.group(1).strip() if match else "" |
22 | 23 |
|
23 | | -# Step 2: Extract submission data |
| 24 | +# Extract structured data from each issue |
24 | 25 | submissions = [] |
25 | 26 | for issue in issues: |
26 | 27 | body = issue.body or "" |
27 | 28 | entry = { |
28 | 29 | "Issue #": issue.number, |
29 | 30 | "Nominee Name": extract("Nominee Name", body), |
30 | 31 | "Nominee Email": extract("Nominee Email", body), |
31 | | - "Organization": extract("Organization / Affiliation", body), |
| 32 | + "GitHub Handle": extract("Nominee's GitHub or GitLab Handle", body), |
| 33 | + "Organization": extract("(Optional) Organization / Affiliation", body), |
32 | 34 | "Location": extract("City, State/Province, Country", body), |
33 | | - "Contributions": extract("Relevant Contributions and Links", body), |
34 | | - "Ambassador Pitch": extract("Why do you want to be a PyTorch Ambassador?", body), |
35 | | - "Extra Notes": extract("Additional Notes or Comments", body), |
36 | | - "Nominate Others": extract("I would like to nominate contributors", body), |
37 | | - "Additional Info": extract("Any other information", body) |
| 35 | + "Your Name": extract("Your Name", body), |
| 36 | + "Your Email": extract("Your Email (Optional)", body), |
| 37 | + "Submission Summary": "\n\n".join([ |
| 38 | + f"Nominee Self/Nominated: {extract('Select one:', body)}", |
| 39 | + f"Requirements Acknowledged: {extract('Please confirm that the nominee meets the following requirements:', body)}", |
| 40 | + f"Contributions: {extract('How has the nominee contributed to PyTorch?', body)}", |
| 41 | + f"Ambassador Pitch: {extract('🏆 How Would the Nominee Contribute as an Ambassador?', body)}", |
| 42 | + "Additional Info: " + extract("Any additional details you'd like to share?", body) |
| 43 | + ]) |
38 | 44 | } |
39 | 45 | submissions.append(entry) |
40 | 46 |
|
41 | | -print("🧹 Deduplicating...") |
| 47 | +print("🧹 Deduplicating by email or name...") |
42 | 48 |
|
43 | | -# Step 3: Deduplicate — keep latest per email/name |
| 49 | +# Deduplication logic: keep latest (by issue #), use email if available |
44 | 50 | latest_submissions = {} |
45 | | -seen_keys = set() |
46 | | - |
47 | 51 | for entry in sorted(submissions, key=lambda x: x["Issue #"], reverse=True): |
48 | 52 | key = entry["Nominee Email"].lower() if entry["Nominee Email"] else entry["Nominee Name"].lower() |
49 | | - if key not in latest_submissions: |
| 53 | + if key and key not in latest_submissions: |
50 | 54 | latest_submissions[key] = entry |
51 | | - seen_keys.add(key) |
52 | 55 |
|
53 | 56 | deduped = list(latest_submissions.values()) |
| 57 | +duplicates = [s for s in submissions if s not in deduped] |
54 | 58 |
|
55 | | -# Step 4: Track duplicates |
56 | | -duplicates = [] |
57 | | -seen_keys_copy = seen_keys.copy() # prevent modifying original while checking |
58 | | -for entry in submissions: |
59 | | - key = entry["Nominee Email"].lower() if entry["Nominee Email"] else entry["Nominee Name"].lower() |
60 | | - if key in seen_keys_copy: |
61 | | - seen_keys_copy.remove(key) # keep only the first seen (i.e., latest) |
62 | | - else: |
63 | | - duplicates.append(entry) |
64 | | - |
65 | | -# Step 5: Ensure output directory exists |
66 | | -output_dir = "ambassador/output_step1" |
67 | | -os.makedirs(output_dir, exist_ok=True) |
| 59 | +# Ensure output folder |
| 60 | +os.makedirs("ambassador", exist_ok=True) |
68 | 61 |
|
69 | | -# Step 6: Write full submissions |
70 | | -with open(os.path.join(output_dir, "ambassador_submissions_full.csv"), "w", newline='', encoding="utf-8") as f: |
| 62 | +# Write raw submissions |
| 63 | +with open("ambassador/submissions_all_raw.csv", "w", newline='', encoding="utf-8") as f: |
71 | 64 | writer = csv.DictWriter(f, fieldnames=submissions[0].keys()) |
72 | 65 | writer.writeheader() |
73 | 66 | writer.writerows(submissions) |
74 | 67 |
|
75 | | -# Step 7: Write deduplicated submissions |
76 | | -with open(os.path.join(output_dir, "ambassador_submissions_deduped.csv"), "w", newline='', encoding="utf-8") as f: |
| 68 | +# Write deduplicated submissions |
| 69 | +with open("ambassador/submissions_deduplicated.csv", "w", newline='', encoding="utf-8") as f: |
77 | 70 | writer = csv.DictWriter(f, fieldnames=deduped[0].keys()) |
78 | 71 | writer.writeheader() |
79 | 72 | writer.writerows(deduped) |
80 | 73 |
|
81 | | -# Step 8: Write duplicates removed |
| 74 | +# Write duplicates to Excel if any |
82 | 75 | if duplicates: |
83 | | - with open(os.path.join(output_dir, "duplicates_removed.csv"), "w", newline='', encoding="utf-8") as f: |
84 | | - writer = csv.DictWriter(f, fieldnames=duplicates[0].keys()) |
85 | | - writer.writeheader() |
86 | | - writer.writerows(duplicates) |
87 | | - print(f"🗂️ Duplicates written to {output_dir}/duplicates_removed.csv") |
| 76 | + wb = Workbook() |
| 77 | + ws = wb.active |
| 78 | + ws.title = "Duplicates Removed" |
| 79 | + ws.append(list(duplicates[0].keys())) |
| 80 | + for d in duplicates: |
| 81 | + ws.append([d.get(k, "") for k in duplicates[0]]) |
| 82 | + wb.save("ambassador/submissions_duplicates_removed.xlsx") |
| 83 | + print("🗂️ Duplicates written to ambassador/submissions_duplicates_removed.xlsx") |
88 | 84 | else: |
89 | 85 | print("✅ No duplicates found.") |
90 | 86 |
|
91 | | -print("✅ Step 1 complete: Extraction + Deduplication done.") |
| 87 | +print("🎉 Done: Data extracted and files saved.") |
0 commit comments