|
| 1 | +import re |
| 2 | +import json |
| 3 | +from pathlib import Path |
| 4 | +from sota_extractor2.data.elastic import Reference2 |
| 5 | +from sota_extractor2.data.references import PReference, PAuthor, ReferenceStore |
| 6 | +from tqdm import tqdm |
| 7 | +from elasticsearch.helpers import bulk |
| 8 | +from elasticsearch_dsl.connections import connections |
| 9 | +import http.client |
| 10 | +import xml.etree.ElementTree as ET |
| 11 | + |
# Raise the stdlib header cap (http.client default is 100) — Elasticsearch
# bulk responses can exceed it; required for bulk saving to succeed.
http.client._MAXHEADERS = 1000

# Input dump of PapersWithCode papers (JSON list of paper dicts).
# The "-duplicates" variant is currently selected; the other path is kept for reference.
# papers_path = Path("/tmp/papers/papers-with-abstracts.json")
papers_path = Path("/tmp/papers/papers-with-abstracts-duplicates.json")
| 17 | + |
| 18 | + |
def read_pwc_papers(path):
    """Load the PapersWithCode dump (a JSON list of paper dicts) from *path*.

    Opens the file with an explicit UTF-8 encoding so parsing does not depend
    on the platform's default locale (paper titles/authors are not ASCII-only).
    """
    with open(path, "rt", encoding="utf-8") as f:
        return json.load(f)
| 22 | + |
| 23 | + |
# Recognize arxiv ids either bare ("1801.00001", optional version/.pdf suffix)
# or embedded in an abs/pdf/e-print arxiv.org URL.  Dots in the host names are
# escaped: the originals used bare ".", which also matched hosts like
# "arxivXorg" or "paperswithcodeXcom".
arxiv_url_re = re.compile(r"^(?:https?://(?:www\.)?arxiv\.org/(?:abs|pdf|e-print)/)?(?P<arxiv_id>\d{4}\.\d+)(?:v\d+)?(?:\.pdf)?$")
# Same as above but the arxiv.org URL prefix is mandatory (no bare ids).
arxiv_url_only_re = re.compile(r"^(?:https?://(?:www\.)?arxiv\.org/(?:abs|pdf|e-print)/)(?P<arxiv_id>\d{4}\.\d+)(?:v\d+)?(?:\.pdf)?$")
# Extract the paper slug from a paperswithcode.com paper URL.
pwc_url_re = re.compile(r"^(?:https?://(?:www\.)?)paperswithcode\.com/paper/(?P<slug>[^/]*)/?$")
| 27 | + |
| 28 | + |
def from_paper_dict(paper):
    """Build a PReference from a single PapersWithCode paper dict.

    The arxiv id is taken from the explicit ``arxiv_id`` field when present,
    otherwise parsed out of the abstract URL.  Trailing dots/spaces are
    stripped from the title, and the PWC slug is recovered from the paper URL.
    Uses ``dict.get`` throughout so a missing key behaves like an empty value
    instead of raising KeyError.
    """
    authors_raw = paper.get("authors") or []
    authors = [PAuthor.from_fullname(a) for a in authors_raw if a.strip()]

    arxiv_id = paper.get("arxiv_id") or None
    if not arxiv_id and paper.get("url_abs"):
        m = arxiv_url_re.match(paper["url_abs"])
        if m:
            arxiv_id = m.group("arxiv_id")

    title = None
    if paper.get("title"):
        title = paper["title"].rstrip(" .")

    slug = None
    if paper.get("paper_url"):
        m = pwc_url_re.match(paper["paper_url"])
        if m:
            slug = m.group("slug")

    return PReference(
        title=title,
        authors=authors,
        ptr=paper.get("url_pdf") or paper.get("url_abs"),
        arxiv_id=arxiv_id,
        pwc_slug=slug,
        date=paper.get("date"),
        orig_ref=f"{', '.join(authors_raw)}. {paper.get('title')}.",
    )
| 55 | + |
| 56 | + |
def from_paper_elem(elem):
    """Build a PReference from a DBLP XML record element.

    URL choice: the first ``<ee>`` wins unless a later one is marked open
    access (``oa`` attribute); an ``<ee>`` pointing at arxiv always wins and
    also supplies the arxiv id.  Elements with empty text (``<author/>``,
    ``<title/>``, ``<ee/>``) are skipped instead of crashing — the original
    called ``.text.strip()`` on a possibly-None ``.text``.
    """
    authors_str = [a.text.strip() for a in elem.findall("author")
                   if a.text and a.text.strip()]
    authors = [PAuthor.from_fullname(a) for a in authors_str]

    arxiv_id = None
    url = None
    for ee in elem.findall("ee"):
        if not ee.text:
            continue
        if url is None or "oa" in ee.attrib:  # prefer open access urls
            url = ee.text
        m = arxiv_url_only_re.match(ee.text)
        if m:
            # an arxiv link trumps everything else — take it and stop looking
            url = ee.text
            arxiv_id = m.group("arxiv_id")
            break

    title = None
    title_elem = elem.find("title")
    if title_elem is not None and title_elem.text:
        # NOTE(review): .text only covers text before any nested markup
        # (e.g. <i> inside DBLP titles) — presumed acceptable here; confirm.
        title = title_elem.text.rstrip(" .")

    return PReference(
        title=title,
        authors=authors,
        ptr=url,
        arxiv_id=arxiv_id,
        orig_ref=f"{', '.join(authors_str)}. {title}.",
    )
| 81 | + |
| 82 | + |
def merge_references(p_references, elastic_references=None):
    """Fold parsed references into Reference2 docs keyed by unique id.

    p_references: iterable of PReference objects providing ``unique_id()``.
    elastic_references: dict mapping uid -> Reference2, updated in place.
        When None, falls back to the module-level ``elastic_references``
        for backward compatibility.  The original always read the global,
        which only ever existed as a *local* inside init_pwc/init_dblp and
        therefore raised NameError — callers should pass the dict (or bind
        the global) explicitly.
    """
    if elastic_references is None:
        # legacy fallback: resolve the module-level dict by name
        elastic_references = globals()["elastic_references"]
    for p_ref in tqdm(p_references):
        uid = p_ref.unique_id()
        e_ref = elastic_references.get(uid)
        if not e_ref:
            # first time we see this id: create a fresh elastic doc for it
            e_ref = Reference2.from_ref(p_ref)
            elastic_references[uid] = e_ref
        e_ref.add_ref(p_ref)
| 91 | + |
| 92 | + |
def save_all(docs):
    """Bulk-index every document in *docs* into Elasticsearch (500 per chunk)."""
    actions = (doc.to_dict(True) for doc in docs)
    bulk(connections.get_connection(), actions, chunk_size=500)
| 95 | + |
| 96 | + |
def init_pwc():
    """(Re)index references from the PapersWithCode JSON dump.

    Reads the dump at ``papers_path``, converts each paper dict to a
    PReference, merges them with any docs already in Elasticsearch, and bulk
    saves the result.
    """
    # merge_references looks this dict up at module level; without the global
    # declaration it was only a local here and the call raised NameError.
    global elastic_references

    # read list of ML papers (titles, abstracts, arxiv ids, etc.)
    all_papers = read_pwc_papers(papers_path)

    # change dicts into PReferences
    p_references = [from_paper_dict(paper) for paper in all_papers]

    # keep only references with a valid unique id
    p_references = [ref for ref in p_references if ref.unique_id()]

    # fetch docs already present in the index so new data extends them
    all_ids = list({ref.unique_id() for ref in p_references})
    elastic_references = {
        uid: ref
        for uid, ref in zip(all_ids, Reference2.mget(all_ids))
        if ref
    }

    merge_references(p_references)
    save_all(elastic_references.values())
| 115 | + |
| 116 | + |
def init_dblp():
    """(Re)index references parsed from a local DBLP XML dump.

    Parses ``~/data/dblp/dblp-10k-noent.xml``, converts each record to a
    PReference, merges with already-indexed docs, and bulk saves.
    """
    # merge_references looks this dict up at module level; without the global
    # declaration it was only a local here and the call raised NameError.
    global elastic_references

    dblp_xml = ET.parse(str(Path.home() / "data" / "dblp" / "dblp-10k-noent.xml"))
    root = dblp_xml.getroot()
    # Element.getchildren() was removed in Python 3.9; iterate the element itself.
    p_references = [from_paper_elem(elem) for elem in list(root)]
    p_references = [ref for ref in p_references if ref.unique_id()]

    # fetch docs already present in the index so new data extends them
    all_ids = list({ref.unique_id() for ref in p_references})
    elastic_references = {
        uid: ref
        for uid, ref in zip(all_ids, Reference2.mget(all_ids))
        if ref
    }

    merge_references(p_references)
    save_all(elastic_references.values())
| 131 | + |
# Script entry point: indexes the DBLP dump on import/run.  Switch the
# commented call to index the PapersWithCode dump instead.
init_dblp()
#init_pwc()
0 commit comments