From 3d640057d47e9f45abe590134204e3bbd7caaa85 Mon Sep 17 00:00:00 2001 From: Azax4 Date: Mon, 9 Jun 2025 17:20:04 +0100 Subject: [PATCH 01/17] Added script to update author page issues --- bin/update_author-page_issues.py | 77 ++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 bin/update_author-page_issues.py diff --git a/bin/update_author-page_issues.py b/bin/update_author-page_issues.py new file mode 100644 index 0000000000..71ab42cb87 --- /dev/null +++ b/bin/update_author-page_issues.py @@ -0,0 +1,77 @@ + +"""Usage: update_author-page_issues.py + +Updates all issues containing "Author page:" in the title to follow the latest template + +Set your OS environment variable "GITHUB_TOKEN" to your personal token or hardcode it in the code. Make sure to not reveal it to others! + +""" + +import os +import requests + +# Configuration +GITHUB_TOKEN = os.getenv("GITHUB_TOKEN") #Can hardcode token here +REPO_OWNER = 'acl-org' +REPO_NAME = 'acl-anthology' + +# Base URL +BASE_URL = f'https://api.github.com/repos/{REPO_OWNER}/{REPO_NAME}' + +HEADERS = { + 'Authorization': f'token {GITHUB_TOKEN}', + 'Accept': 'application/vnd.github.v3+json' +} + +def get_issues_with_title(title): + issues_url = f'{BASE_URL}/issues' + params = {'state': 'open', 'per_page': 100} + issues = [] + + while issues_url: + response = requests.get(issues_url, headers=HEADERS, params=params) + response.raise_for_status() + data = response.json() + + for issue in data: + if title in issue.get('title', '') and 'pull_request' not in issue: + issues.append(issue) + + issues_url = response.links.get('next', {}).get('url') + + return issues + +def add_comment_to_issue(issue_number, comment): + url = f'{BASE_URL}/issues/{issue_number}/comments' + payload = {'body': comment} + response = requests.post(url, headers=HEADERS, json=payload) + response.raise_for_status() + print(f'Comment added to issue #{issue_number}') + +def edit_body_of_issue(issue_number, new_body): + url = 
f'https://api.github.com/repos/{REPO_OWNER}/{REPO_NAME}/issues/{issue_number}' + payload = {'body': new_body} + response = requests.patch(url, headers=HEADERS, json=payload) + response.raise_for_status() + print(f'Edited body of issue (ID: {issue_number}) updated.') + +def main(): + print('🔎 Fetching issues...') + issues = get_issues_with_title("Author Page:") + + for issue in issues: + number = issue["number"] + print(f'---\nProcessing issue #{number}: {issue["title"]}') + + add_comment_to_issue(number, "Hi! We have just added a few new fields to help us manage our database of author pages better. You can see the new fields in the body of the issue. Please fill these out and let us know when done so that we can continue working on your issue. Thank you for your coperation!") + + issue_body = issue["body"] + if "### Author ORCID" not in issue_body: + issue_body_list = issue_body.split("### Type of Author Metadata Correction") + issue_body_list.insert(1, "### Author ORCID\n\n-Add ORCID here-\n\n### Institution of highest (anticipated) degree\n\n-Add insitution here-\n\n### Author Name (only if published in another script)\n\n -add author name here if needed-\n\n### Is the authors name read right to left? 
(only if published in another script)\n\n- [ ] Script is read right-to-left.\n\n### Type of Author Metadata Correction") + issue_body = "".join(issue_body_list) + edit_body_of_issue(number, issue_body) + + +if __name__ == '__main__': + main() From fdfca82850e5f62ffa3e483540588152142fb759 Mon Sep 17 00:00:00 2001 From: Azax4 Date: Mon, 9 Jun 2025 17:21:55 +0100 Subject: [PATCH 02/17] Updated author issue template --- .github/ISSUE_TEMPLATE/02-name-correction.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/02-name-correction.yml b/.github/ISSUE_TEMPLATE/02-name-correction.yml index e95b8fd1a5..bb82aaa6f9 100644 --- a/.github/ISSUE_TEMPLATE/02-name-correction.yml +++ b/.github/ISSUE_TEMPLATE/02-name-correction.yml @@ -25,7 +25,7 @@ body: validations: required: true - - type: textarea + - type: input id: author_orcid attributes: label: Author ORCID @@ -34,7 +34,7 @@ body: placeholder: ex. https://orcid.org/my-orcid?orcid=0009-0003-8868-7504 validations: required: true - - type: textarea + - type: input id: author_highest_degree_institution attributes: label: Institution of highest (anticipated) degree @@ -45,7 +45,7 @@ body: placeholder: ex. 
Johns Hopkins University (https://www.jhu.edu/) validations: required: true - - type: textarea + - type: input id: author_name_script_variant attributes: label: Author Name (only if published in another script) From a342362613508d635518c18df829cf7837a6f79d Mon Sep 17 00:00:00 2001 From: Azax4 Date: Mon, 9 Jun 2025 17:25:33 +0100 Subject: [PATCH 03/17] Minor change No need to add comment if body does not need to be updated --- bin/update_author-page_issues.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/bin/update_author-page_issues.py b/bin/update_author-page_issues.py index 71ab42cb87..ff807c38cf 100644 --- a/bin/update_author-page_issues.py +++ b/bin/update_author-page_issues.py @@ -63,7 +63,7 @@ def main(): number = issue["number"] print(f'---\nProcessing issue #{number}: {issue["title"]}') - add_comment_to_issue(number, "Hi! We have just added a few new fields to help us manage our database of author pages better. You can see the new fields in the body of the issue. Please fill these out and let us know when done so that we can continue working on your issue. Thank you for your coperation!") + issue_body = issue["body"] if "### Author ORCID" not in issue_body: @@ -71,6 +71,8 @@ def main(): issue_body_list.insert(1, "### Author ORCID\n\n-Add ORCID here-\n\n### Institution of highest (anticipated) degree\n\n-Add insitution here-\n\n### Author Name (only if published in another script)\n\n -add author name here if needed-\n\n### Is the authors name read right to left? (only if published in another script)\n\n- [ ] Script is read right-to-left.\n\n### Type of Author Metadata Correction") issue_body = "".join(issue_body_list) edit_body_of_issue(number, issue_body) + + add_comment_to_issue(number, "Hi! We have just added a few new fields to help us manage our database of author pages better. You can see the new fields in the body of the issue. Please fill these out and let us know when done so that we can continue working on your issue. 
Thank you for your coperation!") if __name__ == '__main__': From 6064ae0e788729d6accb3ed3ce324735ef202b75 Mon Sep 17 00:00:00 2001 From: Matt Post Date: Tue, 10 Jun 2025 17:01:53 -0400 Subject: [PATCH 04/17] minor changes --- bin/update_author-page_issues.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/update_author-page_issues.py b/bin/update_author-page_issues.py index ff807c38cf..253aa37446 100644 --- a/bin/update_author-page_issues.py +++ b/bin/update_author-page_issues.py @@ -68,11 +68,11 @@ def main(): issue_body = issue["body"] if "### Author ORCID" not in issue_body: issue_body_list = issue_body.split("### Type of Author Metadata Correction") - issue_body_list.insert(1, "### Author ORCID\n\n-Add ORCID here-\n\n### Institution of highest (anticipated) degree\n\n-Add insitution here-\n\n### Author Name (only if published in another script)\n\n -add author name here if needed-\n\n### Is the authors name read right to left? (only if published in another script)\n\n- [ ] Script is read right-to-left.\n\n### Type of Author Metadata Correction") + issue_body_list.insert(1, "### Author ORCID\n\n-Add ORCID here-\n\n### Institution of highest (anticipated) degree\n\n-Add insitution here-\n\n### Author Name (only if published in another script)\n\n -add author name here if needed-\n\n### Is the author's name read right to left? (only if published in another script)\n\n- [ ] Script is read right-to-left.\n\n### Type of Author Metadata Correction") issue_body = "".join(issue_body_list) edit_body_of_issue(number, issue_body) - add_comment_to_issue(number, "Hi! We have just added a few new fields to help us manage our database of author pages better. You can see the new fields in the body of the issue. Please fill these out and let us know when done so that we can continue working on your issue. Thank you for your coperation!") + add_comment_to_issue(number, "Hi! 
We have just added a few new fields to help us manage our author database and decrease ambiguity on future imports. Please fill out the fields that have been added to the body of this issue, and leave a comment when you are finished. We will then proceed with processing this request.") if __name__ == '__main__': From 8a8676ba57fa3c46ef5067332ea68ccd81512a2c Mon Sep 17 00:00:00 2001 From: Matt Post Date: Mon, 18 Aug 2025 18:10:22 -0400 Subject: [PATCH 05/17] Update script --- ...issues.py => update_author_page_issues.py} | 63 ++++++++++++++++--- 1 file changed, 53 insertions(+), 10 deletions(-) rename bin/{update_author-page_issues.py => update_author_page_issues.py} (50%) mode change 100644 => 100755 diff --git a/bin/update_author-page_issues.py b/bin/update_author_page_issues.py old mode 100644 new mode 100755 similarity index 50% rename from bin/update_author-page_issues.py rename to bin/update_author_page_issues.py index 253aa37446..d36b7fae30 --- a/bin/update_author-page_issues.py +++ b/bin/update_author_page_issues.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 """Usage: update_author-page_issues.py @@ -8,10 +9,11 @@ """ import os +import textwrap import requests # Configuration -GITHUB_TOKEN = os.getenv("GITHUB_TOKEN") #Can hardcode token here +GITHUB_TOKEN = os.getenv("GITHUB_TOKEN") # Can hardcode token here REPO_OWNER = 'acl-org' REPO_NAME = 'acl-anthology' @@ -20,9 +22,10 @@ HEADERS = { 'Authorization': f'token {GITHUB_TOKEN}', - 'Accept': 'application/vnd.github.v3+json' + 'Accept': 'application/vnd.github.v3+json', } + def get_issues_with_title(title): issues_url = f'{BASE_URL}/issues' params = {'state': 'open', 'per_page': 100} @@ -41,6 +44,7 @@ def get_issues_with_title(title): return issues + def add_comment_to_issue(issue_number, comment): url = f'{BASE_URL}/issues/{issue_number}/comments' payload = {'body': comment} @@ -48,6 +52,7 @@ def add_comment_to_issue(issue_number, comment): response.raise_for_status() print(f'Comment added to issue 
#{issue_number}') + def edit_body_of_issue(issue_number, new_body): url = f'https://api.github.com/repos/{REPO_OWNER}/{REPO_NAME}/issues/{issue_number}' payload = {'body': new_body} @@ -55,25 +60,63 @@ def edit_body_of_issue(issue_number, new_body): response.raise_for_status() print(f'Edited body of issue (ID: {issue_number}) updated.') -def main(): + +def main(issue_ids): print('🔎 Fetching issues...') - issues = get_issues_with_title("Author Page:") + issues = get_issues_with_title("Author Page:") + get_issues_with_title("Author Metadata:") + print(f"Found {len(issues)} issues.") for issue in issues: number = issue["number"] + + if issue_ids and number not in issue_ids: + # print(f"Skipping issue #{number}: {issue['title']}") + continue + print(f'---\nProcessing issue #{number}: {issue["title"]}') - - issue_body = issue["body"] if "### Author ORCID" not in issue_body: issue_body_list = issue_body.split("### Type of Author Metadata Correction") - issue_body_list.insert(1, "### Author ORCID\n\n-Add ORCID here-\n\n### Institution of highest (anticipated) degree\n\n-Add insitution here-\n\n### Author Name (only if published in another script)\n\n -add author name here if needed-\n\n### Is the author's name read right to left? (only if published in another script)\n\n- [ ] Script is read right-to-left.\n\n### Type of Author Metadata Correction") + issue_body_list.insert( + 1, + textwrap.dedent(""" + ### Author ORCID + + -Add ORCID here- + + ### Institution of highest (anticipated) degree + + -Add insitution here- + + ### Your papers (if required, see comment below) + + -Provide Anthology IDs or Anthology URLs here- + + ### Type of Author Metadata Correction + """), + ) issue_body = "".join(issue_body_list) edit_body_of_issue(number, issue_body) - - add_comment_to_issue(number, "Hi! We have just added a few new fields to help us manage our author database and decrease ambiguity on future imports. 
Please fill out the fields that have been added to the body of this issue, and leave a comment when you are finished. We will then proceed with processing this request.") + + add_comment_to_issue( + number, + textwrap.dedent(""" + Hello: we are attempting to close out a large backlog of author page requests. As part of these efforts, + we are collecting additional information which will help us assign papers to the correct author + in the future. Please modify the updated description above with the requested information. + + If you are requesting to split an author page (i.e., your page has some papers that are not yours), + please also provide a list of your papers, in the form of Anthology IDs or URLS + (e.g., 2023.wmt-1.13 or https://aclanthology.org/2023.wmt-1.13/). + """) + ) if __name__ == '__main__': - main() + import argparse + parser = argparse.ArgumentParser(description='Update author page issues') + parser.add_argument('issue_ids', nargs='+', type=int, help='List of issue IDs to update') + args = parser.parse_args() + + main(args.issue_ids) From 81f2a6b395a0296501816b8537347efc6b770a0f Mon Sep 17 00:00:00 2001 From: Matt Post Date: Mon, 18 Aug 2025 18:16:05 -0400 Subject: [PATCH 06/17] Small updates --- bin/update_author_page_issues.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/bin/update_author_page_issues.py b/bin/update_author_page_issues.py index d36b7fae30..04bce1b224 100755 --- a/bin/update_author_page_issues.py +++ b/bin/update_author_page_issues.py @@ -63,7 +63,7 @@ def edit_body_of_issue(issue_number, new_body): def main(issue_ids): print('🔎 Fetching issues...') - issues = get_issues_with_title("Author Page:") + get_issues_with_title("Author Metadata:") + issues = get_issues_with_title("Author Metadata:") + get_issues_with_title("Author Page:") print(f"Found {len(issues)} issues.") for issue in issues: @@ -103,8 +103,9 @@ def main(issue_ids): number, textwrap.dedent(""" Hello: we are attempting to close out a large 
backlog of author page requests. As part of these efforts, - we are collecting additional information which will help us assign papers to the correct author - in the future. Please modify the updated description above with the requested information. + we are collecting additional information ([ORCID](https://orcid.org/) and degree institution) which will + help us assign papers to the correct author in the future. Please modify the updated description above + with the requested information. If you are requesting to split an author page (i.e., your page has some papers that are not yours), please also provide a list of your papers, in the form of Anthology IDs or URLS From 130ee5e2fa5d70403efd2b8cf6cc5c4b3975f1bc Mon Sep 17 00:00:00 2001 From: Matt Post Date: Mon, 18 Aug 2025 18:21:23 -0400 Subject: [PATCH 07/17] More fiddling with the message --- bin/update_author_page_issues.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/bin/update_author_page_issues.py b/bin/update_author_page_issues.py index 04bce1b224..37cbbc86d0 100755 --- a/bin/update_author_page_issues.py +++ b/bin/update_author_page_issues.py @@ -68,12 +68,13 @@ def main(issue_ids): for issue in issues: number = issue["number"] + title = issue["title"] if issue_ids and number not in issue_ids: # print(f"Skipping issue #{number}: {issue['title']}") continue - print(f'---\nProcessing issue #{number}: {issue["title"]}') + print(f'---\nProcessing issue #{number}: {title}') issue_body = issue["body"] if "### Author ORCID" not in issue_body: @@ -102,14 +103,9 @@ def main(issue_ids): add_comment_to_issue( number, textwrap.dedent(""" - Hello: we are attempting to close out a large backlog of author page requests. As part of these efforts, - we are collecting additional information ([ORCID](https://orcid.org/) and degree institution) which will - help us assign papers to the correct author in the future. 
Please modify the updated description above - with the requested information. - - If you are requesting to split an author page (i.e., your page has some papers that are not yours), - please also provide a list of your papers, in the form of Anthology IDs or URLS - (e.g., 2023.wmt-1.13 or https://aclanthology.org/2023.wmt-1.13/). + Hello: we are attempting to close out a large backlog of author page requests. As part of these efforts, we are collecting additional information ([ORCID](https://orcid.org/) and degree institution) which will help us assign papers to the correct author in the future. Please modify the updated description above with the requested information. + + If you are requesting to split an author page (i.e., your page has some papers that are not yours), please also provide a list of your papers, in the form of Anthology IDs or URLS (e.g., 2023.wmt-1.13 or https://aclanthology.org/2023.wmt-1.13/). We are unable to match papers to Google or Semantic Scholar profiles. """) ) @@ -117,7 +113,7 @@ def main(issue_ids): if __name__ == '__main__': import argparse parser = argparse.ArgumentParser(description='Update author page issues') - parser.add_argument('issue_ids', nargs='+', type=int, help='List of issue IDs to update') + parser.add_argument('issue_ids', nargs='*', type=int, help='List of issue IDs to update') args = parser.parse_args() main(args.issue_ids) From 870a09c0028e6fa81bfae4e047a73507bbf32a86 Mon Sep 17 00:00:00 2001 From: Matt Post Date: Mon, 18 Aug 2025 18:21:49 -0400 Subject: [PATCH 08/17] black --- bin/update_author_page_issues.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/bin/update_author_page_issues.py b/bin/update_author_page_issues.py index 37cbbc86d0..ff8cd1208b 100755 --- a/bin/update_author_page_issues.py +++ b/bin/update_author_page_issues.py @@ -63,7 +63,9 @@ def edit_body_of_issue(issue_number, new_body): def main(issue_ids): print('🔎 Fetching issues...') - issues = 
get_issues_with_title("Author Metadata:") + get_issues_with_title("Author Page:") + issues = get_issues_with_title("Author Metadata:") + get_issues_with_title( + "Author Page:" + ) print(f"Found {len(issues)} issues.") for issue in issues: @@ -81,7 +83,8 @@ def main(issue_ids): issue_body_list = issue_body.split("### Type of Author Metadata Correction") issue_body_list.insert( 1, - textwrap.dedent(""" + textwrap.dedent( + """ ### Author ORCID -Add ORCID here- @@ -95,25 +98,31 @@ def main(issue_ids): -Provide Anthology IDs or Anthology URLs here- ### Type of Author Metadata Correction - """), + """ + ), ) issue_body = "".join(issue_body_list) edit_body_of_issue(number, issue_body) add_comment_to_issue( number, - textwrap.dedent(""" + textwrap.dedent( + """ Hello: we are attempting to close out a large backlog of author page requests. As part of these efforts, we are collecting additional information ([ORCID](https://orcid.org/) and degree institution) which will help us assign papers to the correct author in the future. Please modify the updated description above with the requested information. If you are requesting to split an author page (i.e., your page has some papers that are not yours), please also provide a list of your papers, in the form of Anthology IDs or URLS (e.g., 2023.wmt-1.13 or https://aclanthology.org/2023.wmt-1.13/). We are unable to match papers to Google or Semantic Scholar profiles. 
- """) + """ + ), ) if __name__ == '__main__': import argparse + parser = argparse.ArgumentParser(description='Update author page issues') - parser.add_argument('issue_ids', nargs='*', type=int, help='List of issue IDs to update') + parser.add_argument( + 'issue_ids', nargs='*', type=int, help='List of issue IDs to update' + ) args = parser.parse_args() main(args.issue_ids) From e9ec27420735be0d1cba488717c9ba66e5163bcf Mon Sep 17 00:00:00 2001 From: Matt Post Date: Wed, 20 Aug 2025 09:43:05 -0400 Subject: [PATCH 09/17] Add instructions --- .github/copilot-instructions.md | 133 +++++++ .../process-author-page.instructions.md | 343 ++++++++++++++++++ bin/add_author_id.py | 15 +- 3 files changed, 485 insertions(+), 6 deletions(-) create mode 100644 .github/copilot-instructions.md create mode 100644 .github/instructions/process-author-page.instructions.md diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md new file mode 100644 index 0000000000..2ae96e8d5b --- /dev/null +++ b/.github/copilot-instructions.md @@ -0,0 +1,133 @@ +# ACL Anthology Copilot Instructions + +## Project Overview +The ACL Anthology is a digital archive of NLP/CL research papers with both a static website generator and a Python package for metadata access. The project manages scholarly publication metadata through XML files and generates a Hugo-based website. + +## Architecture & Data Flow + +### Core Data Model +- **Authoritative XML files** in `data/xml/` contain all paper metadata (schema: `data/xml/schema.rnc`) +- **YAML configuration** in `data/yaml/` defines venues, SIGs, and name variants +- **Hugo static site** generated from processed JSON data in `build/data/` +- **Python package** (`python/acl_anthology/`) provides programmatic access to metadata + +### Build Process Pipeline +1. **XML Processing**: `bin/create_hugo_data.py` converts XML → JSON for Hugo templates +2. **Bibliography Generation**: `bin/create_extra_bib.py` creates BibTeX/MODS/Endnote exports +3. 
**Hugo Site Generation**: Hugo processes JSON data → static HTML site +4. **Asset Management**: PDF files, attachments managed separately with checksums + +Key build targets in `Makefile`: +- `make all` - Full build (check + site) +- `make hugo_data` - Generate JSON data files only +- `make site` - Generate complete website +- `make check` - Validate XML schema compliance + +## Critical ID System + +### Modern Format (post-2020) +- Format: `YEAR.VENUE-VOLUME.NUMBER` (e.g., `2020.acl-main.12`) +- **VENUE**: lowercase alphanumeric venue identifier (no years!) +- **VOLUME**: volume name (`main`, `short`, `1`, etc.) +- **NUMBER**: paper number within volume + +### Legacy Format (pre-2020) +- Various letter-based schemes (P19-1234, W19-5012, etc.) +- Limited paper capacity, inflexible venue encoding + +## Development Workflows + +### XML Metadata Management +- All paper metadata lives in `data/xml/{COLLECTION_ID}.xml` files +- Use `bin/ingest_aclpub2.py` for bulk ingestion from conference data +- Individual modifications via scripts like `bin/add_author_id.py`, `bin/fix_titles.py` +- **Always validate with `make check`** after XML changes + +### Author Name Handling +- Complex disambiguation system for author identity resolution +- Name variants stored in `data/yaml/name_variants.yaml` +- Scripts: `bin/find_name_variants.py`, `bin/auto_name_variants.py` +- Person IDs assigned automatically but can be explicitly set + +### Testing Strategy +```bash +# Python package tests +cd python && poetry run pytest + +# Full site build test +make check site + +# Integration tests on full data +pytest -m integration +``` + +## Project-Specific Patterns + +### XML Structure Philosophy +- **Separation of content and presentation**: Raw metadata in XML, formatting via Hugo templates +- **Hierarchical organization**: Collections → Volumes → Papers +- **Checksum validation**: All file references include SHA-256 checksums (8-char prefix) + +### Script Naming Conventions +- `add_*.py` - Add 
new metadata fields +- `fix_*.py` - Correct existing data +- `ingest_*.py` - Import data from external sources +- `create_*.py` - Generate derived files + +### Hugo Data Export Pattern +```python +# All export scripts follow this pattern: +def export_ENTITY(anthology, builddir, dryrun): + # Process anthology data + data = {...} + if not dryrun: + with open(f"{builddir}/data/{entity}.json", "wb") as f: + f.write(ENCODER.encode(data)) +``` + +## Environment Setup + +### Dependencies +- **Python 3.10+** with packages from `bin/requirements.txt` +- **Hugo 0.126.0+** (extended version required) +- **bibutils** for citation format conversion +- **jing** for XML validation + +### Development Commands +```bash +# Setup environment +python3 -m venv venv && source venv/bin/activate +pip install -r bin/requirements.txt + +# Quick data regeneration (development) +make NOBIB=true hugo_data hugo + +# Full production build +make all +``` + +## Key Integration Points + +### External Data Sources +- **ACLPub2**: Conference management system data ingestion +- **Papers with Code**: Research code linking +- **CrossRef**: DOI metadata synchronization +- **Google Scholar**: Author profile integration + +### File Management +- PDFs and attachments stored separately from metadata +- Environment variables: `ANTHOLOGY_PREFIX`, `ANTHOLOGYFILES` +- Symlinked as `anthology-files` in generated site + +## Common Pitfalls +- **Never include years in venue identifiers** - venues are persistent entities +- **XML changes require `make check`** - schema validation is mandatory +- **Author name disambiguation is automatic** - manual overrides via explicit IDs only +- **Hugo memory usage is ~18GB** - normal on large sites, may cause swapping +- **Venue vs Event confusion** - venues are persistent, events are year-specific instances + +## File Locations for Common Tasks +- **Add new venue**: `data/yaml/venues/{venue-id}.yaml` +- **Fix paper metadata**: Edit `data/xml/{collection}.xml` directly +- 
**Modify site templates**: `hugo/layouts/` +- **Update build process**: `Makefile` and `bin/create_hugo_data.py` diff --git a/.github/instructions/process-author-page.instructions.md b/.github/instructions/process-author-page.instructions.md new file mode 100644 index 0000000000..0bb50cfe73 --- /dev/null +++ b/.github/instructions/process-author-page.instructions.md @@ -0,0 +1,343 @@ +--- +applyTo: 'data/xml/*.xml' +--- + +# Processing ACL Anthology Author Page Issues + +This guide provides instructions for processing GitHub issues requesting author page corrections in the ACL Anthology. There are two types of requests: **merging** and **splitting** author pages. + +## Prerequisites & Requirements + +All author page requests **must** include: +- **Valid ORCID ID** (format: `0000-0000-0000-0000`) +- **Institution** where highest (anticipated) degree was/will be obtained +- **Clear identification** of which papers belong to the author + +## Workflow Overview + +1. **Setup**: Ensure master branch is up to date, create working branch +2. **Process**: Make required changes based on request type (merge or split) +3. **Validate**: Run checks to ensure changes are correct +4. **Submit**: Commit changes and create PR referencing the issue + +## Initial Setup + +### 1. Update and Create Branch + +```bash +# Ensure master is up to date +git checkout master +git pull origin master + +# Create branch using the pattern: author-page-{author_id} +git checkout -b author-page-{author_id} +``` + +**Branch naming examples**: +- Merge: `author-page-matt-post` +- Split: `author-page-matt-post-rochester` + +## Request Type 1: Merging Author Pages + +**Use case**: Author has published under multiple name variants and wants them consolidated under a canonical name. + +**Example**: "Matt Post" and "Matthew Post" should be merged under "Matt Post". + +### Steps: + +1. 
**Add entry to `data/yaml/name_variants.yaml`**: + ```yaml + - canonical: {first: Matt, last: Post} + orcid: 0000-0000-0000-0000 + institution: Johns Hopkins University # Include even though not currently used + variants: + - {first: Matthew, last: Post} + ``` + +2. **Important notes**: + - Canonical name should be the author's preferred variant + - Include all name variants found in the XML files + - The `institution` field should be included for future use + +## Request Type 2: Splitting Author Pages + +**Use case**: Multiple authors published under the same name and need to be separated. + +**Example**: Papers under "Matt Post" are actually from different people - separate out the papers belonging to the requester. + +### Steps: + +#### 2.1 Create Author ID for Requester + +Add entry to `data/yaml/name_variants.yaml`: +```yaml +- canonical: {first: Matt, last: Post} + id: matt-post-rochester # Format: firstname-lastname-institution + orcid: 0000-0000-0000-0000 + institution: University of Rochester +``` + +**ID format rules**: +- Lowercase only +- Hyphens replace spaces +- Use recognizable institution abbreviation +- Examples: `yang-liu-umich`, `john-smith-stanford`, `jane-doe-google` + +#### 2.2 Tag Author's Papers + +Use the `bin/add_author_id.py` script to efficiently add the ID to all papers belonging to the requester: + +```bash +# Add ID to all papers by the author's first and last name +bin/add_author_id.py matt-post-rochester --first-name "Matt" --last-name "Post" +``` + +This will add the `id` attribute to matching `<author>` tags: + +```xml +<!-- Before --> +<author><first>Matt</first><last>Post</last></author> + +<!-- After --> +<author id="matt-post-rochester"><first>Matt</first><last>Post</last></author> +``` + +**Note**: The script automatically maintains proper XML formatting and preserves indentation. + +#### 2.3 Handle Remaining Papers + +For papers that don't belong to the requester (the "other" Matt Post): + +If there is no entry in the YAML file, create one. 
+```yaml +- canonical: {first: Matt, last: Post} + id: matt-post + comment: "May refer to several people" +``` + +Then, use the `bin/add_author_id.py` script to efficiently add the ID to all untagged papers: + +```bash +# Add ID to all papers by the author's first and last name +bin/add_author_id.py matt-post --first-name "Matt" --last-name "Post" +``` + +### Helper Tools + +- `bin/add_author_id.py author-id --last-name "LastName"` - Bulk add ID to matching authors +- `bin/add_explicit_author_id.py` - Add IDs based on existing disambiguation + +## Validation & Testing + +### Required Checks + +```bash +# Validate XML schema compliance +make check + +# Test data generation +make hugo_data +``` + +### Common Issues to Avoid + +- **Invalid ORCID format**: Must be exactly `0000-0000-0000-0000` +- **XML formatting**: Don't break single-line `<author>` tags into multiple lines +- **Duplicate IDs**: Ensure new author IDs are unique +- **Missing canonical**: Canonical name must match one existing name variant + +## File Locations + +- **Name variants**: `data/yaml/name_variants.yaml` +- **Paper metadata**: `data/xml/{year}.{venue}.xml` (e.g., `2020.acl-main.xml`) + +## Examples + +### Merge Example +```yaml +# Merging "John P. Smith" and "John Smith" +- canonical: {first: John P., last: Smith} + orcid: 0000-0002-1234-5678 + institution: Stanford University + variants: + - {first: John, last: Smith} + - {first: J. P., last: Smith} +``` + +### Split Example +```yaml +# Splitting "Yang Liu" - requester from University of Michigan +- canonical: {first: Yang, last: Liu} + id: yang-liu-umich + orcid: 0000-0003-1234-5678 + institution: University of Michigan + +# Generic entry for remaining papers +- canonical: {first: Yang, last: Liu} + id: yang-liu + comment: "May refer to several people" +``` + +## Completion + +### 1. 
Commit Changes + +```bash +# Add all modified files +git add data/yaml/name_variants.yaml data/xml/*.xml + +# Commit with reference to issue number +git commit -m "Process author page request for {Author Name} + +Closes #{issue_number} + +- {Brief description of changes made} +" + +# Push branch +git push origin author-page-{author_id} +``` + +### 2. Create Pull Request + +- **Title**: `Author page: {Author Name} ({merge|split})` +- **Body**: Reference the GitHub issue number and summarize changes +- **Labels**: Add appropriate labels (`author-page`, `merge` or `split`) + +The PR will trigger automated builds and tests. Once merged, the changes will be reflected in the next site build. +For each paper belonging to the disambiguated author, add `id` attribute to XML: + +**Example**: In `data/xml/2020.acl.xml`: +```xml +<paper id="1"> + <author id="yang-liu-umich"><first>Yang</first><last>Liu</last></author> + <!-- other paper metadata --> +</paper> +``` + +**Formatting Requirements**: +- Keep `<author>` tags on single line (don't expand to multiple lines) +- Preserve existing indentation and spacing patterns +- Use existing XML formatting tools to maintain consistency + +**Tools available**: +- `bin/add_author_id.py author-id --last-name "LastName"` - Bulk add ID to author +- `bin/add_explicit_author_id.py` - Add IDs based on existing disambiguation + +#### 4.3 Handle Remaining Papers +For papers that don't belong to the author with the explicit ID: +1. **Option A**: Leave unchanged (they remain under generic ID) +2. 
**Option B**: Create another explicit ID for the other author if requested + +#### 4.4 Update Similar Authors (if applicable) +If multiple authors have similar names, add `similar` field: +```yaml +- canonical: {first: Yang, last: Liu} + id: yang-liu-umich + orcid: 0000-0000-0000-0000 + similar: [yang-liu-edinburgh, yang-liu-pk] +``` + +## ID Generation Rules + +### Author ID Format +- **Structure**: `firstname-lastname-institution` +- **Rules**: + - Lowercase only + - Hyphens replace spaces and special characters + - Institution should be recognizable abbreviation + - Examples: `yang-liu-umich`, `john-smith-stanford` + +### Institution Abbreviations +Common patterns: +- Universities: `umich`, `stanford`, `cmu`, `mit` +- Companies: `google`, `microsoft`, `facebook` +- Use domain-based abbreviations when possible + +## Validation and Testing + +### Required Checks +```bash +# Validate XML schema compliance +make check + +# Test site generation with changes +make hugo_data + +# Full build test (if making significant changes) +make site +``` + +### Formatting Consistency +- **XML**: Preserve single-line formatting for `<author>` and `<editor>` tags +- **YAML**: Follow existing indentation (2 spaces) and structure in `name_variants.yaml` +- **Use project tools**: Scripts like `bin/add_author_id.py` maintain proper formatting automatically +- **Indentation**: Use `anthology.utils.indent()` function for XML pretty-printing when needed + +### Common Issues +- **Invalid ORCID format**: Must be `0000-0000-0000-0000` +- **XML schema violations**: Missing required fields, invalid nesting +- **Name mismatches**: Canonical name not matching any existing papers +- **Duplicate IDs**: Ensure new author IDs are unique + +## File Locations + +- **Name variants**: `data/yaml/name_variants.yaml` +- **XML metadata**: `data/xml/{collection}.xml` (e.g., `2020.acl.xml`) +- **Validation script**: `make check` +- **Author ID tools**: `bin/add_author_id.py`, `bin/add_explicit_author_id.py` + +## Examples +
+### Merge Example +```yaml +# Merging "John P. Smith" and "John Smith" profiles +- canonical: {first: John P., last: Smith} + orcid: 0000-0002-1234-5678 + variants: + - {first: John, last: Smith} + - {first: J. P., last: Smith} +``` + +### Split Example +```yaml +# Splitting "Yang Liu" into institution-specific profiles +- canonical: {first: Yang, last: Liu} + id: yang-liu-umich + orcid: 0000-0003-1234-5678 + comment: University of Michigan + similar: [yang-liu-edinburgh] +``` + +With corresponding XML updates: +```xml +<author id="yang-liu-umich"><first>Yang</first><last>Liu</last></author> +``` + +**Note**: Maintain single-line format for author tags as shown above. + +## Post-Processing + +### 1. Commit and Push Changes +```bash +# Add all changes +git add data/yaml/name_variants.yaml data/xml/*.xml + +# Commit with descriptive message +git commit -m "Author page correction: {author-name} ({merge|split})" + +# Push branch +git push origin author-page-{author_id} +``` + +### 2. Create Pull Request +- **Title**: `Author page correction: {Author Name} ({merge|split})` +- **Body**: Include link to original GitHub issue and summary of changes +- **Labels**: Add `correction`, `metadata` labels +- **Assignees**: Add `anthology-assist` + +### 3. Post-PR Actions +1. **Update GitHub issue**: Comment with link to PR and close original issue +2. **Monitor build**: Ensure site builds successfully after merge +3. **Verify author pages**: Check that author pages display correctly on staging/live site +4.
**Archive decision**: Document rationale for complex disambiguation cases diff --git a/bin/add_author_id.py b/bin/add_author_id.py index d0c8dbc262..d8e166bc8b 100755 --- a/bin/add_author_id.py +++ b/bin/add_author_id.py @@ -24,12 +24,13 @@ Usage: - ./add_author_id.py bill-byrne --last-name Byrne + ./add_author_id.py bill-byrne --last-name Byrne --first-name Bill """ import argparse import os +from pathlib import Path from anthology.utils import indent from itertools import chain @@ -37,10 +38,7 @@ def main(args): - for xml_file in os.listdir(args.data_dir): - if not xml_file.endswith(".xml"): - continue - + for xml_file in Path(args.data_dir).glob("**/*.xml"): changed_one = False tree = ET.parse(xml_file) @@ -53,7 +51,11 @@ def main(args): if "id" in author_xml.attrib: continue last_name = author_xml.find("./last").text - if last_name == args.last_name: + try: + first_name = author_xml.find("./first").text + except AttributeError: + first_name = "" + if last_name == args.last_name and first_name == args.first_name: paper_id = ( paper_xml.attrib["id"] if paper_xml.text == "paper" else "0" ) @@ -71,6 +73,7 @@ def main(args): parser = argparse.ArgumentParser() parser.add_argument("id", help="Author ID to add") parser.add_argument("--last-name", help="Author's last name") + parser.add_argument("--first-name", help="Author's first name") parser.add_argument("--confirm", action="store_true", help="Confirm each instance") parser.add_argument( "--data-dir", default=os.path.join(os.path.dirname(__file__), "..", "data", "xml") From 3271c3da2528aaed7ee0fb0393cd0ac344fe73a0 Mon Sep 17 00:00:00 2001 From: Matt Post Date: Wed, 20 Aug 2025 19:08:44 -0400 Subject: [PATCH 10/17] Rework script to use module --- bin/add_author_id.py | 117 ++++++++++++++++++++----------------------- 1 file changed, 53 insertions(+), 64 deletions(-) diff --git a/bin/add_author_id.py b/bin/add_author_id.py index d8e166bc8b..d3fbb077e7 100755 --- a/bin/add_author_id.py +++ b/bin/add_author_id.py @@ 
-1,83 +1,72 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# -# Copyright 2022 Matt Post -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# -*- coding: utf-8 -*- +"""Add an author ID to NameSpecification entries using the acl_anthology module. +This script finds author/editor name specifications matching a given +first/last name where no explicit ID is present, sets the provided ID, and +saves the affected collection XML files. + +Usage: + ./add_author_id.py --last-name [--first-name ] [--data-dir ] """ -Adds an ID tag to all instances of an author in all XML files where there is no ID tag. -First use case was the Bill Byrne separation of July 2022. 
+from __future__ import annotations -2020.gebnlp-1.4 E14-1026 E14-1028 W16-2324 2021.acl-long.55 2021.eancs-1.2 W15-0116 D19-1125 D19-1331 D19-1459 P14-3000 2022.naacl-main.136 W18-1821 W18-5420 W18-6427 2020.nlp4call-1.2 N19-1406 2021.emnlp-main.620 2021.emnlp-main.666 N18-2081 N18-3013 W17-3531 2020.wmt-1.94 D15-1273 2022.nlp4convai-1.7 P16-2049 C14-1195 P19-1022 W19-4417 W19-4424 W19-5340 W19-5421 2020.wat-1.21 E17-2058 2022.ecnlp-1.13 J14-3008 N15-1041 N15-1105 P18-2051 D17-1208 D17-1220 D17-2005 2020.acl-main.690 2020.acl-main.693 N16-1100 2022.findings-acl.223 2022.findings-acl.301 +import argparse +from collections import defaultdict +from itertools import chain +from typing import Set -Usage: +from acl_anthology.anthology import Anthology +from acl_anthology.people import Name - ./add_author_id.py bill-byrne --last-name Byrne --first-name Bill -""" -import argparse -import os +def main(args: argparse.Namespace) -> None: + anthology = Anthology(args.data_dir, verbose=True) -from pathlib import Path -from anthology.utils import indent -from itertools import chain + last_name, first_name = args.name.split(",") if "," in args.name else (args.name, None) + + people = anthology.find_people(args.name) + if not people: + print(f"No person found matching name {args.name}") + + # find the person with the non-explicit ID + for person in people: + if not person.is_explicit: + break + print(f"Found person: {person}") + + if not person: + print(f"No person found matching name {args.name} with an explicit ID") + return + + for paper in person.papers(): + print("PAPER", paper.full_id) + authors = paper.get_editors() if paper.is_frontmatter else paper.authors + for author in authors: + if author.name in person.names: + print("-> Found", author) + author.id = args.id + # collection_paper_map[paper.collection_id].append(paper.full_id) -import lxml.etree as ET - - -def main(args): - for xml_file in Path(args.data_dir).glob("**/*.xml"): - changed_one = False - - tree = 
ET.parse(xml_file) - for paper_xml in chain( - tree.getroot().findall(".//paper"), tree.getroot().findall(".//meta") - ): - for author_xml in chain( - paper_xml.findall("./author"), paper_xml.findall("./editor") - ): - if "id" in author_xml.attrib: - continue - last_name = author_xml.find("./last").text - try: - first_name = author_xml.find("./first").text - except AttributeError: - first_name = "" - if last_name == args.last_name and first_name == args.first_name: - paper_id = ( - paper_xml.attrib["id"] if paper_xml.text == "paper" else "0" - ) - anth_id = f"{xml_file}/{paper_id}" - print(f"Adding {args.id} to {anth_id}...") - author_xml.attrib["id"] = args.id - changed_one = True - - if changed_one: - indent(tree.getroot()) - tree.write(xml_file, encoding="UTF-8", xml_declaration=True) + # save the anthology (doesn't currently work) + anthology.save_all() if __name__ == "__main__": - parser = argparse.ArgumentParser() + parser = argparse.ArgumentParser("Add an author ID to all of an author's papers") parser.add_argument("id", help="Author ID to add") - parser.add_argument("--last-name", help="Author's last name") - parser.add_argument("--first-name", help="Author's first name") - parser.add_argument("--confirm", action="store_true", help="Confirm each instance") + parser.add_argument("--name", "-n", help="Author's name (last[, first])") parser.add_argument( - "--data-dir", default=os.path.join(os.path.dirname(__file__), "..", "data", "xml") + "--data-dir", + default=None, + help="Path to anthology data directory (default: ../data relative to repository root)", ) args = parser.parse_args() + # Normalize data_dir to a Path string used by Anthology + # If the user supplies a path, trust it; otherwise compute relative to this script + if args.data_dir is None: + from pathlib import Path + args.data_dir = str(Path(__file__).parent.parent / "data") main(args) From 1f7741f19b0339dfd9b8e5d5a53bb0aa8543d45a Mon Sep 17 00:00:00 2001 From: Matt Post Date: Thu, 21 Aug 2025 
07:46:18 -0400 Subject: [PATCH 11/17] Modify to partially use the module --- bin/add_author_id.py | 82 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 65 insertions(+), 17 deletions(-) diff --git a/bin/add_author_id.py b/bin/add_author_id.py index d3fbb077e7..16df9f7b54 100755 --- a/bin/add_author_id.py +++ b/bin/add_author_id.py @@ -2,12 +2,10 @@ # -*- coding: utf-8 -*- """Add an author ID to NameSpecification entries using the acl_anthology module. -This script finds author/editor name specifications matching a given -first/last name where no explicit ID is present, sets the provided ID, and -saves the affected collection XML files. +This script adds the name ID to all papers matching the first and last name. Usage: - ./add_author_id.py --last-name [--first-name ] [--data-dir ] + ./add_author_id.py "Last name[, First name]" """ from __future__ import annotations @@ -16,15 +14,19 @@ from collections import defaultdict from itertools import chain from typing import Set +from pathlib import Path from acl_anthology.anthology import Anthology -from acl_anthology.people import Name + +# old library since we're still editing XML files +from anthology.utils import indent +import lxml.etree as ET def main(args: argparse.Namespace) -> None: anthology = Anthology(args.data_dir, verbose=True) - last_name, first_name = args.name.split(",") if "," in args.name else (args.name, None) + last_name, first_name = args.name.split(", ") if ", " in args.name else (args.name, None) people = anthology.find_people(args.name) if not people: @@ -40,17 +42,64 @@ def main(args: argparse.Namespace) -> None: print(f"No person found matching name {args.name} with an explicit ID") return + # Build a collection of the set of papers to modify within each XML file + collection_to_paper_map = defaultdict(list) for paper in person.papers(): - print("PAPER", paper.full_id) - authors = paper.get_editors() if paper.is_frontmatter else paper.authors - for author in authors: - if author.name 
in person.names: - print("-> Found", author) - author.id = args.id - # collection_paper_map[paper.collection_id].append(paper.full_id) - - # save the anthology (doesn't currently work) - anthology.save_all() + collection_to_paper_map[paper.collection_id].append(paper.full_id_tuple) + + print(collection_to_paper_map) + + # Now iterate over those files and the papers within them + for collection_id, paper_id_tuples in collection_to_paper_map.items(): + xml_file = Path(args.data_dir) / "xml" / f"{collection_id}.xml" + + tree = ET.parse(xml_file) + + for paper_tuple in paper_id_tuples: + _, volume_id, paper_id = paper_tuple + + # Get the paper + paper_xml = tree.getroot().find(f"./volume[@id='{volume_id}']/paper[@id='{paper_id}']") + + for author_xml in chain( + paper_xml.findall("./author"), paper_xml.findall("./editor") + ): + if "id" in author_xml.attrib: + continue + try: + author_first_name = author_xml.find("./first").text + except AttributeError: + author_first_name = None + author_last_name = author_xml.find("./last").text + + print("Found", first_name, last_name) + + if author_last_name == last_name and author_first_name == first_name: + paper_id = ( + paper_xml.attrib["id"] if paper_xml.text == "paper" else "0" + ) + anth_id = f"{xml_file}/{paper_id}" + print(f"Adding {args.id} to {anth_id}...") + author_xml.attrib["id"] = args.id + + indent(tree.getroot()) + tree.write(xml_file, encoding="UTF-8", xml_declaration=True) + + """ + Once we have the module published, we should be able to modify this to use + it to write the changed XML files, instead of the above. 
+ """ + # for paper in person.papers(): + # print("PAPER", paper.full_id) + # authors = paper.get_editors() if paper.is_frontmatter else paper.authors + # for author in authors: + # if author.name in person.names: + # print("-> Found", author) + # author.id = args.id + # # collection_paper_map[paper.collection_id].append(paper.full_id) + + # # save the anthology (doesn't currently work) + # anthology.save_all() if __name__ == "__main__": @@ -66,7 +115,6 @@ def main(args: argparse.Namespace) -> None: # Normalize data_dir to a Path string used by Anthology # If the user supplies a path, trust it; otherwise compute relative to this script if args.data_dir is None: - from pathlib import Path args.data_dir = str(Path(__file__).parent.parent / "data") main(args) From e8f3d5b26b93f890a8975f07ad6b9598d7251775 Mon Sep 17 00:00:00 2001 From: Matt Post Date: Thu, 21 Aug 2025 07:58:32 -0400 Subject: [PATCH 12/17] Add --paper-ids argument --- bin/add_author_id.py | 59 +++++++++++++++++++++++++++----------------- 1 file changed, 36 insertions(+), 23 deletions(-) diff --git a/bin/add_author_id.py b/bin/add_author_id.py index 16df9f7b54..a152dcc2b4 100755 --- a/bin/add_author_id.py +++ b/bin/add_author_id.py @@ -3,9 +3,11 @@ """Add an author ID to NameSpecification entries using the acl_anthology module. This script adds the name ID to all papers matching the first and last name. +It will use the module to find the list of papers to edit. Alternatively, you can +provide it with the list of papers. Usage: - ./add_author_id.py "Last name[, First name]" + ./add_author_id.py <author-id> "Last name[, First name]" [--paper-ids 2028.acl-main.74 ...]
""" from __future__ import annotations @@ -24,30 +26,42 @@ def main(args: argparse.Namespace) -> None: - anthology = Anthology(args.data_dir, verbose=True) last_name, first_name = args.name.split(", ") if ", " in args.name else (args.name, None) - people = anthology.find_people(args.name) - if not people: - print(f"No person found matching name {args.name}") - - # find the person with the non-explicit ID - for person in people: - if not person.is_explicit: - break - print(f"Found person: {person}") - - if not person: - print(f"No person found matching name {args.name} with an explicit ID") - return + anthology = Anthology(args.data_dir, verbose=True) # Build a collection of the set of papers to modify within each XML file collection_to_paper_map = defaultdict(list) - for paper in person.papers(): - collection_to_paper_map[paper.collection_id].append(paper.full_id_tuple) - print(collection_to_paper_map) + if args.paper_ids: + for paper_id in args.paper_ids: + paper = anthology.get_paper(paper_id) + if paper: + collection_to_paper_map[paper.collection_id].append(paper.full_id_tuple) + + else: + people = anthology.find_people(args.name) + if not people: + print(f"No person found matching name {args.name}") + + # find the person with the non-explicit ID + for person in people: + if not person.is_explicit: + break + + if not person: + print(f"No person found matching name {args.name} with an explicit ID") + return + + for paper in person.papers(): + collection_to_paper_map[paper.collection_id].append(paper.full_id_tuple) + + if collection_to_paper_map: + print(f"Will edit the following paper IDs:") + for paper_id_tuples in collection_to_paper_map.values(): + for paper_id in paper_id_tuples: + print(f" - {paper_id}") # Now iterate over those files and the papers within them for collection_id, paper_id_tuples in collection_to_paper_map.items(): @@ -72,14 +86,12 @@ def main(args: argparse.Namespace) -> None: author_first_name = None author_last_name = 
author_xml.find("./last").text - print("Found", first_name, last_name) - if author_last_name == last_name and author_first_name == first_name: paper_id = ( paper_xml.attrib["id"] if paper_xml.text == "paper" else "0" ) - anth_id = f"{xml_file}/{paper_id}" - print(f"Adding {args.id} to {anth_id}...") + paper_id = anthology.get_paper(paper_tuple).full_id + print(f"Adding {args.id} to {author_first_name} {author_last_name} on paper {paper_id}...") author_xml.attrib["id"] = args.id indent(tree.getroot()) @@ -105,7 +117,8 @@ def main(args: argparse.Namespace) -> None: if __name__ == "__main__": parser = argparse.ArgumentParser("Add an author ID to all of an author's papers") parser.add_argument("id", help="Author ID to add") - parser.add_argument("--name", "-n", help="Author's name (last[, first])") + parser.add_argument("name", help="Author's name (last[, first])") + parser.add_argument("--paper-ids", nargs="*", help="List of paper IDs to modify") parser.add_argument( "--data-dir", default=None, From 2c9bf4f3dbcd0ed6667369d40166fc06c11e7044 Mon Sep 17 00:00:00 2001 From: Matt Post Date: Thu, 21 Aug 2025 11:08:05 -0400 Subject: [PATCH 13/17] black --- bin/add_author_id.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/bin/add_author_id.py b/bin/add_author_id.py index a152dcc2b4..0961b450cc 100755 --- a/bin/add_author_id.py +++ b/bin/add_author_id.py @@ -15,7 +15,6 @@ import argparse from collections import defaultdict from itertools import chain -from typing import Set from pathlib import Path from acl_anthology.anthology import Anthology @@ -27,7 +26,9 @@ def main(args: argparse.Namespace) -> None: - last_name, first_name = args.name.split(", ") if ", " in args.name else (args.name, None) + last_name, first_name = ( + args.name.split(", ") if ", " in args.name else (args.name, None) + ) anthology = Anthology(args.data_dir, verbose=True) @@ -58,7 +59,7 @@ def main(args: argparse.Namespace) -> None: 
collection_to_paper_map[paper.collection_id].append(paper.full_id_tuple) if collection_to_paper_map: - print(f"Will edit the following paper IDs:") + print("Will edit the following paper IDs:") for paper_id_tuples in collection_to_paper_map.values(): for paper_id in paper_id_tuples: print(f" - {paper_id}") @@ -73,7 +74,9 @@ def main(args: argparse.Namespace) -> None: _, volume_id, paper_id = paper_tuple # Get the paper - paper_xml = tree.getroot().find(f"./volume[@id='{volume_id}']/paper[@id='{paper_id}']") + paper_xml = tree.getroot().find( + f"./volume[@id='{volume_id}']/paper[@id='{paper_id}']" + ) for author_xml in chain( paper_xml.findall("./author"), paper_xml.findall("./editor") @@ -91,7 +94,9 @@ def main(args: argparse.Namespace) -> None: paper_xml.attrib["id"] if paper_xml.text == "paper" else "0" ) paper_id = anthology.get_paper(paper_tuple).full_id - print(f"Adding {args.id} to {author_first_name} {author_last_name} on paper {paper_id}...") + print( + f"Adding {args.id} to {author_first_name} {author_last_name} on paper {paper_id}..." + ) author_xml.attrib["id"] = args.id indent(tree.getroot()) @@ -99,7 +104,7 @@ def main(args: argparse.Namespace) -> None: """ Once we have the module published, we should be able to modify this to use - it to write the changed XML files, instead of the above. + it to write the changed XML files, instead of the above. 
""" # for paper in person.papers(): # print("PAPER", paper.full_id) From dbcdcae51748cdd7e97c3512ce80e63165b9a28e Mon Sep 17 00:00:00 2001 From: Matt Post Date: Thu, 21 Aug 2025 11:16:01 -0400 Subject: [PATCH 14/17] Update instructions to match script changes --- .../process-author-page.instructions.md | 21 +++++++++---------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/.github/instructions/process-author-page.instructions.md b/.github/instructions/process-author-page.instructions.md index 0bb50cfe73..19185dca44 100644 --- a/.github/instructions/process-author-page.instructions.md +++ b/.github/instructions/process-author-page.instructions.md @@ -89,7 +89,7 @@ Use the `bin/add_author_id.py` script to efficiently add the ID to all papers be ```bash # Add ID to all papers by the author's first and last name -bin/add_author_id.py matt-post-rochester --first-name "Matt" --last-name "Post" +bin/add_author_id.py matt-post-rochester "Post, Matt" --paper-ids [list of Anthology paper ids] ``` This will add the `id` attribute to matching `` tags: @@ -119,13 +119,13 @@ Then, use the `bin/add_author_id.py` script to efficiently add the ID to all unt ```bash # Add ID to all papers by the author's first and last name -bin/add_author_id.py matt-post --first-name "Matt" --last-name "Post" +bin/add_author_id.py matt-post "Post, Matt" ``` ### Helper Tools -- `bin/add_author_id.py author-id --last-name "LastName"` - Bulk add ID to matching authors -- `bin/add_explicit_author_id.py` - Add IDs based on existing disambiguation +- `bin/add_author_id.py author-id "Last name, first name"` - Bulk add ID to matching authors +- `bin/add_author_id.py author-id "Last name, first name" --paper-ids ...` - Bulk add ID to matching authors to specific papers (to prevent over-matching on the author name) ## Validation & Testing @@ -155,7 +155,7 @@ make hugo_data ### Merge Example ```yaml -# Merging "John P. Smith" and "John Smith" +# Merging "John P. 
Smith" and "John Smith" - canonical: {first: John P., last: Smith} orcid: 0000-0002-1234-5678 institution: Stanford University @@ -221,10 +221,9 @@ For each paper belonging to the disambiguated author, add `id` attribute to XML: - Use existing XML formatting tools to maintain consistency **Tools available**: -- `bin/add_author_id.py author-id --last-name "LastName"` - Bulk add ID to author -- `bin/add_explicit_author_id.py` - Add IDs based on existing disambiguation +- `bin/add_author_id.py author-id "Last name, first name"` - Bulk add ID to author -#### 4.3 Handle Remaining Papers +#### 4.3 Handle Remaining Papers For papers that don't belong to the author with the explicit ID: 1. **Option A**: Leave unchanged (they remain under generic ID) 2. **Option B**: Create another explicit ID for the other author if requested @@ -242,7 +241,7 @@ If multiple authors have similar names, add `similar` field: ### Author ID Format - **Structure**: `firstname-lastname-institution` -- **Rules**: +- **Rules**: - Lowercase only - Hyphens replace spaces and special characters - Institution should be recognizable abbreviation @@ -285,7 +284,7 @@ make site - **Name variants**: `data/yaml/name_variants.yaml` - **XML metadata**: `data/xml/{collection}.xml` (e.g., `2020.acl.xml`) - **Validation script**: `make check` -- **Author ID tools**: `bin/add_author_id.py`, `bin/add_explicit_author_id.py` +- **Author ID tools**: `bin/add_author_id.py` ## Examples @@ -299,7 +298,7 @@ make site - {first: J. 
P., last: Smith} ``` -### Split Example +### Split Example ```yaml # Splitting "Yang Liu" into institution-specific profiles - canonical: {first: Yang, last: Liu} From 35fd153a60b7db78db9dec54dc80a2f5c41e4fa2 Mon Sep 17 00:00:00 2001 From: Matt Post Date: Thu, 21 Aug 2025 11:52:25 -0400 Subject: [PATCH 15/17] Update prompt files --- .github/copilot-instructions.md | 10 +- .../process-author-page.prompt.md | 162 ++++++++++++++++++ 2 files changed, 167 insertions(+), 5 deletions(-) create mode 100644 .github/instructions/process-author-page.prompt.md diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md index 2ae96e8d5b..3d5e3e32b9 100644 --- a/.github/copilot-instructions.md +++ b/.github/copilot-instructions.md @@ -13,7 +13,7 @@ The ACL Anthology is a digital archive of NLP/CL research papers with both a sta ### Build Process Pipeline 1. **XML Processing**: `bin/create_hugo_data.py` converts XML → JSON for Hugo templates -2. **Bibliography Generation**: `bin/create_extra_bib.py` creates BibTeX/MODS/Endnote exports +2. **Bibliography Generation**: `bin/create_extra_bib.py` creates BibTeX/MODS/Endnote exports 3. **Hugo Site Generation**: Hugo processes JSON data → static HTML site 4. **Asset Management**: PDF files, attachments managed separately with checksums @@ -31,7 +31,7 @@ Key build targets in `Makefile`: - **VOLUME**: volume name (`main`, `short`, `1`, etc.) - **NUMBER**: paper number within volume -### Legacy Format (pre-2020) +### Legacy Format (pre-2020) - Various letter-based schemes (P19-1234, W19-5012, etc.) 
- Limited paper capacity, inflexible venue encoding @@ -54,7 +54,7 @@ Key build targets in `Makefile`: # Python package tests cd python && poetry run pytest -# Full site build test +# Full site build test make check site # Integration tests on full data @@ -71,7 +71,7 @@ pytest -m integration ### Script Naming Conventions - `add_*.py` - Add new metadata fields - `fix_*.py` - Correct existing data -- `ingest_*.py` - Import data from external sources +- `ingest_*.py` - Import data from external sources - `create_*.py` - Generate derived files ### Hugo Data Export Pattern @@ -129,5 +129,5 @@ make all ## File Locations for Common Tasks - **Add new venue**: `data/yaml/venues/{venue-id}.yaml` - **Fix paper metadata**: Edit `data/xml/{collection}.xml` directly -- **Modify site templates**: `hugo/layouts/` +- **Modify site templates**: `hugo/layouts/` - **Update build process**: `Makefile` and `bin/create_hugo_data.py` diff --git a/.github/instructions/process-author-page.prompt.md b/.github/instructions/process-author-page.prompt.md new file mode 100644 index 0000000000..c89f2e148f --- /dev/null +++ b/.github/instructions/process-author-page.prompt.md @@ -0,0 +1,162 @@ +# Prompt template: Process an author-page GitHub issue + +Purpose +------- +This prompt is for automating the `process-author-page` workflow. Give an LLM (or automation) a full GitHub issue (title, body, labels, comments) and it will extract the data needed to fill out the project's author-page instructions and produce a machine-friendly plan and artifacts (YAML snippet, XML edit hints, branch name, commands, PR text, and clarifying questions). + +How to use +---------- +- Provide the full issue object as context: `issue.title`, `issue.body`, `issue.labels`, `issue.author`, `issue.comments` (list of {author, body, created_at}). +- Expect a single JSON object output exactly matching the schema in the "Output schema" section. 
+ +Prompt (give the following to the LLM as the user/system prompt): + +"Process an author-page GitHub issue and produce a complete actionable plan" + +Context you will receive (pass this as context): +- issue.title (string) +- issue.body (string) +- issue.labels (list of strings) +- issue.author (string) +- issue.comments (list of {author, body, created_at}) +- optional: linked PR / linked commits + +Task for the LLM +---------------- +1. Parse the issue and comments to extract: + - canonical_author_name: canonical first/middle/last parts. + - name_variants mentioned in issue/comments. + - requester_author_id (if suggested by user). + - requester_ORCID (if provided). + - requester_institution (if provided). + - primary_paper_ids: Anthology paper IDs the requester claims. + - other_paper_ids: other Anthology IDs referencing the same name. + - requested_action: one of ["create-id-and-assign","assign-existing-id","split","merge","other"], or "clarify" if ambiguous. + - whether the user requests a dummy id for other people sharing the name. + - urgency / labels like "author-page" / "high-priority". + +2. Validate and enrich: + - Validate ORCID format (pattern: 0000-0000-0000-0000). + - Validate Anthology ID patterns; if missing set papers_to_verify=true. + - If ambiguous or missing data, populate `clarifying_questions` with concise questions. + +3. Produce outputs using the exact JSON schema below. Be concise and machine-parseable. When generating branch names and ids, follow repository conventions described in guidelines. 
+ +Output schema (RETURN EXACTLY this JSON object; do not return extra text) +-------------------------------------------------------------------------------- +{ + "metadata": { + "issue_title": string, + "issue_number": integer_or_null, + "issue_author": string, + "labels": [string] + }, + + "extracted": { + "canonical_name": { "first": string, "middle": string_or_null, "last": string }, + "name_variants": [string], + "requester": { + "author_id_proposed": string_or_null, + "orcid": string_or_null, + "institution": string_or_null, + "claim_paper_ids": [string] + }, + "other_paper_ids": [ { "id": string, "found_in_comment_or_body": string } ], + "requested_action": "create-id-and-assign" | "assign-existing-id" | "split" | "merge" | "other" | "clarify", + "wants_dummy_id": boolean, + "ambiguities": [string] + }, + + "plan": { + "branch_name": string, + "name_variants_yaml_snippet": string, + "xml_edits": [ + { "paper_id": string, "file_hint": string_or_null, "author_xpath_hint": string, "action": "add_id" | "remove_id" | "none", "id_to_set": string } + ], + "commands": [ string ], + "git": { + "commit_message": string, + "pr_title": string, + "pr_body": string + }, + "validation_commands": [ string ], + "files_to_edit": [string], + "notes": [string] + }, + + "edge_cases_and_questions": { + "clarifying_questions": [string], + "recommended_dummy_id_format": string, + "conflict_resolution_policy": string + } +} + +Guidelines and conventions (apply when filling fields) +------------------------------------------------------ +- Always use `data/yaml/name_variants.yaml` for new canonical id entries. The YAML snippet must follow existing project structure. 
Example: + + - canonical: {first: Shashank, last: Gupta} + id: shashank-gupta-uiuc + orcid: 0000-0000-0000-0000 + institution: University of Illinois at Urbana-Champaign + comment: "created from issue #NNN: author-confirmed" + +- Shell commands must be repository-root relative and follow this example order: + - git checkout -b <branch_name> + - python3 bin/add_author_id.py <author_id> "Last, First" --paper-ids <paper_ids> + - git add <files_to_edit> + - git commit -m "<commit_message>" + - git push --set-upstream origin <branch_name> + +- Include validation commands: `make check` and `make hugo_data` in `validation_commands`. +- If ambiguous or missing paper IDs, set `requested_action` to "clarify" and include exact clarifying questions for the issue author. + +- ID & branch generation policy: + - Prefer supplied author id. If none supplied, generate `first-last` (lowercase, ascii, hyphenated). + - If that collides with an existing id, append an institution shortname (e.g., `-uiuc`) or year suffix (e.g., `-2025`). + +Edge cases to handle (list these briefly in `edge_cases_and_questions`): +- Issue requests adding an id but provides no Anthology paper IDs. +- Multiple people share the canonical name across years and venues. +- ORCID present but invalid format. +- A user requests merging two existing ids (detect and set `requested_action`="merge"). +- Concurrent edits: warn to re-open `data/yaml/name_variants.yaml` before editing to avoid overwrites.
+ +Minimal example (illustrative only; real output must derive from the issue): + +{ + "metadata": { "issue_title": "Author page: Shashank Gupta", "issue_number": 3658, "issue_author": "shashank", "labels": ["author-page"] }, + "extracted": { + "canonical_name": {"first":"Shashank","middle":null,"last":"Gupta"}, + "name_variants": ["Gupta, Shashank"], + "requester": {"author_id_proposed":"shashank-gupta-uiuc","orcid":"0000-0002-3683-3739","institution":"University of Illinois at Urbana-Champaign","claim_paper_ids":["L18-1086"]}, + "other_paper_ids": [{"id":"2020.semeval-1.56","found_in_comment_or_body":"comment by alice"}], + "requested_action":"create-id-and-assign", + "wants_dummy_id": true, + "ambiguities": [] + }, + "plan": { + "branch_name":"author-page-shashank-gupta-uiuc", + "name_variants_yaml_snippet":"- canonical: {first: Shashank, last: Gupta}\\n id: shashank-gupta-uiuc\\n orcid: 0000-0002-3683-3739\\n institution: University of Illinois at Urbana-Champaign\\n comment: \\\"from issue #3658\\\"", + "xml_edits": [{"paper_id":"L18-1086","file_hint":"data/xml/L18.xml","author_xpath_hint":"//paper[@id='L18-1086']//author[first='Shashank' and last='Gupta']","action":"add_id","id_to_set":"shashank-gupta-uiuc"}], + "commands":["git checkout -b author-page-shashank-gupta-uiuc","python3 bin/add_author_id.py shashank-gupta-uiuc \"Gupta, Shashank\" --paper-ids L18-1086","git add data/xml/L18.xml data/yaml/name_variants.yaml","git commit -m \"Author page: add shashank-gupta-uiuc and assign L18-1086\\n\\nCloses #3658\"","git push --set-upstream origin author-page-shashank-gupta-uiuc"], + "git": {"commit_message":"Author page: add shashank-gupta-uiuc and assign L18-1086\\n\\nCloses #3658","pr_title":"Author page: Shashank Gupta (shashank-gupta-uiuc)","pr_body":"This PR creates author id `shashank-gupta-uiuc` and assigns it to L18-1086. Also adds a `name_variants` YAML entry. 
See #3658."}, + "validation_commands": ["make check","make hugo_data"], + "files_to_edit":["data/yaml/name_variants.yaml","data/xml/L18.xml"], + "notes":["Re-open `data/yaml/name_variants.yaml` before applying the YAML snippet to avoid overwriting manual edits."] + }, + "edge_cases_and_questions": { + "clarifying_questions": [], + "recommended_dummy_id_format":"last-first", + "conflict_resolution_policy":"If generated id collides, append institution shortname; if still ambiguous, append year suffix and ask the issue author to confirm." + } +} + +Usage notes +----------- +- Feed the full issue (title, body, comments) into this prompt and request exactly one JSON object output. +- If `requested_action` == "clarify", post the `clarifying_questions` as a comment on the issue before making edits. +- Always re-open `data/yaml/name_variants.yaml` to read current contents before applying the `name_variants_yaml_snippet`. +- Run validation commands after edits. + +-- end of prompt template From 7d5c16b31b7688ab2e3041c21062d518dddf5132 Mon Sep 17 00:00:00 2001 From: Matt Post Date: Thu, 21 Aug 2025 12:58:56 -0400 Subject: [PATCH 16/17] Clean up instructions --- .../process-author-page.instructions.md | 128 +++++++----------- 1 file changed, 48 insertions(+), 80 deletions(-) diff --git a/.github/instructions/process-author-page.instructions.md b/.github/instructions/process-author-page.instructions.md index 19185dca44..4bc051c732 100644 --- a/.github/instructions/process-author-page.instructions.md +++ b/.github/instructions/process-author-page.instructions.md @@ -29,13 +29,13 @@ All author page requests **must** include: git checkout master git pull origin master -# Create branch using the pattern: author-page-{author_id} +# Create branch using the pattern: author-page-{author_id} where author_id is the unique identifier for the author git checkout -b author-page-{author_id} ``` **Branch naming examples**: -- Merge: `author-page-matt-post` -- Split: 
`author-page-matt-post-rochester` +- `author-page-matt-post`: often used when merging multiple pages under a single canonical name variant +- `author-page-matt-post-rochester`: used when we need to split a page, disambiguating one author (using their institution) from others ## Request Type 1: Merging Author Pages @@ -58,6 +58,7 @@ git checkout -b author-page-{author_id} - Canonical name should be the author's preferred variant - Include all name variants found in the XML files - The `institution` field should be included for future use + - There is no need to list the canonical version under the variants list ## Request Type 2: Splitting Author Pages @@ -67,59 +68,79 @@ git checkout -b author-page-{author_id} ### Steps: -#### 2.1 Create Author ID for Requester +#### 2.1 Create a base Author ID for all the names + +First, add a "generic" entry to `data/yaml/name_variants.yaml`. For example: -Add entry to `data/yaml/name_variants.yaml`: ```yaml - canonical: {first: Matt, last: Post} - id: matt-post-rochester # Format: firstname-lastname-institution - orcid: 0000-0000-0000-0000 - institution: University of Rochester + id: matt-post + comment: "May refer to several people" ``` -**ID format rules**: -- Lowercase only -- Hyphens replace spaces -- Use recognizable institution abbreviation -- Examples: `yang-liu-umich`, `john-smith-stanford`, `jane-doe-google` +This entry should be inserted in roughly sorted order in the YAML file. This helps avoid merge conflicts +if multiple authors are processed independently at the same time. -#### 2.2 Tag Author's Papers +#### 2.3 Tag all authors with that name string using the generic ID -Use the `bin/add_author_id.py` script to efficiently add the ID to all papers belonging to the requester: +Use the `bin/add_author_id.py` script to efficiently add the ID to all papers that have this author name. 
+Continuing with our "matt-post" example: ```bash # Add ID to all papers by the author's first and last name -bin/add_author_id.py matt-post-rochester "Post, Matt" --paper-ids [list of Anthology paper ids] +bin/add_author_id.py matt-post "Post, Matt" ``` -This will add the `id` attribute to matching `<author>` tags: +This will add the `id` attribute to matching `<author>` tags. For example, this entry ```xml <author><first>Matt</first><last>Post</last></author> +will become this: + -<author id="matt-post-rochester"><first>Matt</first><last>Post</last></author> +<author id="matt-post"><first>Matt</first><last>Post</last></author> ``` **Note**: The script automatically maintains proper XML formatting and preserves indentation. -#### 2.3 Handle Remaining Papers +#### 2.4 Create an Author ID for the Requester -For papers that don't belong to the requester (the "other" Matt Post): +Now that all names are tagged, we want to select out those of the requester and tag them with a new ID. -If there is no entry in the YAML file, create one. +First, add an entry to `data/yaml/name_variants.yaml`: ```yaml - canonical: {first: Matt, last: Post} - id: matt-post - comment: "May refer to several people" + id: matt-post-rochester # Format: firstname-lastname-institution + orcid: 0000-0000-0000-0000 + institution: University of Rochester ``` -Then, use the `bin/add_author_id.py` script to efficiently add the ID to all untagged papers: +**ID format rules**: +- Lowercase only +- Hyphens replace spaces +- Use recognizable institution abbreviation +- Examples: `yang-liu-umich`, `john-smith-stanford`, `jane-doe-google` + +#### 2.5 Tag Author's Papers + +Use the `bin/add_author_id.py` script again, but this time with the `--paper-ids` flag. 
```bash # Add ID to all papers by the author's first and last name -bin/add_author_id.py matt-post "Post, Matt" +bin/add_author_id.py matt-post-rochester "Post, Matt" --paper-ids [list of Anthology paper ids] +``` + +This will change the `id` attribute from the generic one to the specific one for the +requesting author: + +```xml + +<author id="matt-post"><first>Matt</first><last>Post</last></author> + + +<author id="matt-post-rochester"><first>Matt</first><last>Post</last></author> ``` ### Helper Tools @@ -134,9 +155,6 @@ bin/add_author_id.py matt-post "Post, Matt" ```bash # Validate XML schema compliance make check - -# Test data generation -make hugo_data ``` ### Common Issues to Avoid @@ -187,9 +205,7 @@ make hugo_data git add data/yaml/name_variants.yaml data/xml/*.xml # Commit with reference to issue number -git commit -m "Process author page request for {Author Name} - -Closes #{issue_number} +git commit -m "Process author page request for {Author Name} (closes #{issue_number}) - {Brief description of changes made} " @@ -200,7 +216,7 @@ git push origin author-page-{author_id} ### 2. Create Pull Request -- **Title**: `Author page: {Author Name} ({merge|split})` +- **Title**: `Author page: {Author Name}` - **Body**: Reference the GitHub issue number and summarize changes - **Labels**: Add appropriate labels (`author-page`, `merge` or `split`) @@ -223,19 +239,6 @@ For each paper belonging to the disambiguated author, add `id` attribute to XML: **Tools available**: - `bin/add_author_id.py author-id "Last name, first name"` - Bulk add ID to author -#### 4.3 Handle Remaining Papers -For papers that don't belong to the author with the explicit ID: -1. **Option A**: Leave unchanged (they remain under generic ID) -2. 
**Option B**: Create another explicit ID for the other author if requested - -#### 4.4 Update Similar Authors (if applicable) -If multiple authors have similar names, add `similar` field: -```yaml -- canonical: {first: Yang, last: Liu} - id: yang-liu-umich - orcid: 0000-0000-0000-0000 - similar: [yang-liu-edinburgh, yang-liu-pk] -``` ## ID Generation Rules @@ -259,12 +262,6 @@ Common patterns: ```bash # Validate XML schema compliance make check - -# Test site generation with changes -make hugo_data - -# Full build test (if making significant changes) -make site ``` ### Formatting Consistency @@ -286,35 +283,6 @@ make site - **Validation script**: `make check` - **Author ID tools**: `bin/add_author_id.py` -## Examples - -### Merge Example -```yaml -# Merging "John P. Smith" and "John Smith" profiles -- canonical: {first: John P., last: Smith} - orcid: 0000-0002-1234-5678 - variants: - - {first: John, last: Smith} - - {first: J. P., last: Smith} -``` - -### Split Example -```yaml -# Splitting "Yang Liu" into institution-specific profiles -- canonical: {first: Yang, last: Liu} - id: yang-liu-umich - orcid: 0000-0003-1234-5678 - comment: University of Michigan - similar: [yang-liu-edinburgh] -``` - -With corresponding XML updates: -```xml -YangLiu -``` - -**Note**: Maintain single-line format for author tags as shown above. - ## Post-Processing ### 1. 
Commit and Push Changes From e21ef7f41cbc80b9e5f5b2717256055e3f940ab2 Mon Sep 17 00:00:00 2001 From: Matt Post Date: Thu, 21 Aug 2025 20:55:04 -0400 Subject: [PATCH 17/17] Update author instructions --- .../process-author-page.instructions.md | 65 +++++++++++++++++-- 1 file changed, 60 insertions(+), 5 deletions(-) diff --git a/.github/instructions/process-author-page.instructions.md b/.github/instructions/process-author-page.instructions.md index 0bb50cfe73..c422b5a135 100644 --- a/.github/instructions/process-author-page.instructions.md +++ b/.github/instructions/process-author-page.instructions.md @@ -9,9 +9,45 @@ This guide provides instructions for processing GitHub issues requesting author ## Prerequisites & Requirements All author page requests **must** include: +- **GitHub issue number** (e.g., `#123`) +- **The author ID** (e.g., `matt-post` or `matt-post-rochester`) - **Valid ORCID ID** (format: `0000-0000-0000-0000`) - **Institution** where highest (anticipated) degree was/will be obtained -- **Clear identification** of which papers belong to the author +- **Requested action** (merge or split) +- **Clear identification** of which papers belong to the author (in the case of a split) + +Ideally, this input will be in the form of a JSON object. Here is an example input for merging: + +```json +{ + "github_issue": "#123", + "canonical": "Post, Matt", + "variants": [ + "Post, Matthew", + "Post, Matthew J" + ], + "author_id": "matt-post", + "orcid": "0000-0000-0000-0000", + "institution": "University of Rochester", + "action": "merge" +} +``` + +and for splitting: + +```json +{ + "github_issue": "#123", + "author_id": "matt-post-rochester", + "orcid": "0000-0000-0000-0000", + "institution": "University of Rochester", + "action": "split", + "papers": [ + "2024.acl-main.17", + "2018.wmt-1.67" + ] +} +``` ## Workflow Overview @@ -54,10 +90,29 @@ git checkout -b author-page-{author_id} - {first: Matthew, last: Post} ``` -2. 
**Important notes**: - - Canonical name should be the author's preferred variant - - Include all name variants found in the XML files - - The `institution` field should be included for future use +2. **Check out the branch, merging off master**: + +```bash +# Ensure master is up to date +git checkout master +git pull origin master + +# Create branch using the pattern: author-page-{author_id} +git checkout -b author-page-{author_id} +``` + +3. **Commit to the branch, noting the Github issue being closed** + +```bash +git add data/yaml/name_variants.yaml +git commit -m "Merging author pages for {author_name} (closes #{issue_number})" +``` + +**Important notes**: +- Canonical name should be the author's preferred variant +- Include all name variants found in the XML files +- The `institution` field should be included for future use +- Do not create an `id` field (this is only for splitting) ## Request Type 2: Splitting Author Pages