|
1 | 1 | #!/usr/bin/env python3 |
2 | | -# -*- coding: utf-8 -*- |
3 | | -# |
4 | | -# Copyright 2022 Matt Post <[email protected]> |
5 | | -# |
6 | | -# Licensed under the Apache License, Version 2.0 (the "License"); |
7 | | -# you may not use this file except in compliance with the License. |
8 | | -# You may obtain a copy of the License at |
9 | | -# |
10 | | -# http://www.apache.org/licenses/LICENSE-2.0 |
11 | | -# |
12 | | -# Unless required by applicable law or agreed to in writing, software |
13 | | -# distributed under the License is distributed on an "AS IS" BASIS, |
14 | | -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
15 | | -# See the License for the specific language governing permissions and |
16 | | -# limitations under the License. |
| 2 | +# -*- coding: utf-8 -*- |
| 3 | +"""Add an author ID to NameSpecification entries using the acl_anthology module. |
17 | 4 |
|
18 | | -""" |
19 | | -Adds an ID tag to all instances of an author in all XML files where there is no ID tag. |
20 | | -
|
21 | | -First use case was the Bill Byrne separation of July 2022. |
22 | | -
|
23 | | -2020.gebnlp-1.4 E14-1026 E14-1028 W16-2324 2021.acl-long.55 2021.eancs-1.2 W15-0116 D19-1125 D19-1331 D19-1459 P14-3000 2022.naacl-main.136 W18-1821 W18-5420 W18-6427 2020.nlp4call-1.2 N19-1406 2021.emnlp-main.620 2021.emnlp-main.666 N18-2081 N18-3013 W17-3531 2020.wmt-1.94 D15-1273 2022.nlp4convai-1.7 P16-2049 C14-1195 P19-1022 W19-4417 W19-4424 W19-5340 W19-5421 2020.wat-1.21 E17-2058 2022.ecnlp-1.13 J14-3008 N15-1041 N15-1105 P18-2051 D17-1208 D17-1220 D17-2005 2020.acl-main.690 2020.acl-main.693 N16-1100 2022.findings-acl.223 2022.findings-acl.301 |
| 5 | +This script adds the given author ID to every author or editor entry whose first and last name match. |
| 6 | +By default, it uses the acl_anthology module to find the list of papers to edit. Alternatively, you |
| 7 | +can provide it with an explicit list of paper IDs via --paper-ids. |
24 | 8 |
|
25 | 9 | Usage: |
26 | | -
|
27 | | - ./add_author_id.py bill-byrne --last-name Byrne --first-name Bill |
| 10 | + ./add_author_id.py <id> "Last name[, First name]" [--paper-ids 2028.acl-main.74 ...] |
28 | 11 | """ |
29 | 12 |
|
30 | | -import argparse |
31 | | -import os |
| 13 | +from __future__ import annotations |
32 | 14 |
|
33 | | -from pathlib import Path |
34 | | -from anthology.utils import indent |
| 15 | +import argparse |
| 16 | +from collections import defaultdict |
35 | 17 | from itertools import chain |
| 18 | +from pathlib import Path |
| 19 | + |
| 20 | +from acl_anthology.anthology import Anthology |
36 | 21 |
|
| 22 | +# old library since we're still editing XML files |
| 23 | +from anthology.utils import indent |
37 | 24 | import lxml.etree as ET |
38 | 25 |
|
39 | 26 |
|
def _split_name(name: str) -> tuple[str, str | None]:
    """Split ``"Last[, First]"`` into ``(last, first)``; ``first`` is None when absent."""
    # partition() splits on the first ", " only, so a name containing several
    # commas cannot break tuple unpacking the way split(", ") would.
    last, separator, first = name.partition(", ")
    return last, (first if separator else None)


def _child_text(element, tag: str) -> str | None:
    """Return the text of the first ``tag`` child of ``element``, or None if it is missing."""
    child = element.find(tag)
    return None if child is None else child.text


def main(args: argparse.Namespace) -> None:
    """Add ``args.id`` as the ``id`` attribute of matching author/editor elements.

    The papers to edit are either those listed in ``args.paper_ids`` or, when
    that list is empty, all papers of the person matching ``args.name`` who
    does not yet have an explicit ID. Within those papers, every ``<author>``
    or ``<editor>`` element without an ``id`` attribute whose first and last
    name equal the requested name is tagged, and each affected collection's
    XML file is re-indented and written back in place.

    Args:
        args: Parsed command-line arguments. Reads ``id`` (the author ID to
            add), ``name`` (``"Last[, First]"``), ``paper_ids`` (optional list
            of paper IDs) and ``data_dir`` (the Anthology data directory,
            which must contain an ``xml`` subdirectory).
    """
    last_name, first_name = _split_name(args.name)

    anthology = Anthology(args.data_dir, verbose=True)

    # Maps each collection ID to the ID tuples of the papers to modify in it,
    # so that every XML file is parsed and written only once.
    collection_to_paper_map = defaultdict(list)

    if args.paper_ids:
        for paper_id in args.paper_ids:
            paper = anthology.get_paper(paper_id)
            if not paper:
                # Report instead of silently dropping a mistyped ID.
                print(f"Skipping unknown paper ID {paper_id}")
                continue
            collection_to_paper_map[paper.collection_id].append(paper.full_id_tuple)
    else:
        people = anthology.find_people(args.name)
        if not people:
            print(f"No person found matching name {args.name}")
            return

        # Pick the person with the non-explicit ID. Using next() with a default
        # yields None when every match is explicit; a for/break loop would
        # leave the last (explicit) person selected instead.
        person = next((p for p in people if not p.is_explicit), None)
        if person is None:
            print(f"No person found matching name {args.name} without an explicit ID")
            return

        for paper in person.papers():
            collection_to_paper_map[paper.collection_id].append(paper.full_id_tuple)

    if collection_to_paper_map:
        print("Will edit the following paper IDs:")
        for paper_id_tuples in collection_to_paper_map.values():
            for paper_id in paper_id_tuples:
                print(f" - {paper_id}")

    # Now iterate over those files and the papers within them
    for collection_id, paper_id_tuples in collection_to_paper_map.items():
        xml_file = Path(args.data_dir) / "xml" / f"{collection_id}.xml"

        tree = ET.parse(xml_file)
        root = tree.getroot()

        for paper_tuple in paper_id_tuples:
            _, volume_id, paper_id = paper_tuple

            paper_xml = root.find(
                f"./volume[@id='{volume_id}']/paper[@id='{paper_id}']"
            )
            if paper_xml is None:
                # NOTE(review): presumably this happens for entries that are not
                # stored as <paper> elements (e.g. front matter) -- confirm.
                print(f"Could not find paper {paper_tuple} in {xml_file}, skipping")
                continue

            for author_xml in chain(
                paper_xml.findall("./author"), paper_xml.findall("./editor")
            ):
                if "id" in author_xml.attrib:
                    continue

                author_first_name = _child_text(author_xml, "./first")
                author_last_name = _child_text(author_xml, "./last")

                if author_last_name == last_name and author_first_name == first_name:
                    full_id = anthology.get_paper(paper_tuple).full_id
                    print(
                        f"Adding {args.id} to {author_first_name} {author_last_name} on paper {full_id}..."
                    )
                    author_xml.attrib["id"] = args.id

        indent(root)
        tree.write(xml_file, encoding="UTF-8", xml_declaration=True)

    # NOTE: Once we have the module published, we should be able to modify this
    # to use it to write the changed XML files, instead of the above.
    #
    # for paper in person.papers():
    #     print("PAPER", paper.full_id)
    #     authors = paper.get_editors() if paper.is_frontmatter else paper.authors
    #     for author in authors:
    #         if author.name in person.names:
    #             print("-> Found", author)
    #             author.id = args.id
    #             # collection_paper_map[paper.collection_id].append(paper.full_id)
    #
    # # save the anthology (doesn't currently work)
    # anthology.save_all()
70 | 120 |
|
71 | 121 |
|
if __name__ == "__main__":
    # The summary must be passed as ``description``: the first positional
    # parameter of ArgumentParser is ``prog``, so passing it positionally would
    # replace the program name in the usage line instead of describing the tool.
    parser = argparse.ArgumentParser(
        description="Add an author ID to all of an author's papers"
    )
    parser.add_argument("id", help="Author ID to add")
    parser.add_argument("name", help="Author's name (last[, first])")
    parser.add_argument("--paper-ids", nargs="*", help="List of paper IDs to modify")
    parser.add_argument(
        "--data-dir",
        default=None,
        help="Path to anthology data directory (default: ../data relative to this script's directory)",
    )
    args = parser.parse_args()

    # If the user supplies a path, trust it; otherwise fall back to the "data"
    # directory that sits next to this script's parent directory.
    if args.data_dir is None:
        args.data_dir = str(Path(__file__).parent.parent / "data")

    main(args)
0 commit comments