
Commit 6e6237a

Author: Marcin Kardas
Prepopulate references index
Parent: 064b9ee

1 file changed, +133 -0 lines changed

init_references.py

Lines changed: 133 additions & 0 deletions
@@ -0,0 +1,133 @@
import re
import json
from pathlib import Path
from sota_extractor2.data.elastic import Reference2
from sota_extractor2.data.references import PReference, PAuthor, ReferenceStore
from tqdm import tqdm
from elasticsearch.helpers import bulk
from elasticsearch_dsl.connections import connections
import http.client
import xml.etree.ElementTree as ET

# required for bulk saving
http.client._MAXHEADERS = 1000

# papers_path = Path("/tmp/papers/papers-with-abstracts.json")
papers_path = Path("/tmp/papers/papers-with-abstracts-duplicates.json")


def read_pwc_papers(path):
    with open(path, "rt") as f:
        return json.load(f)


arxiv_url_re = re.compile(r"^(?:https?://(?:www\.)?arxiv\.org/(?:abs|pdf|e-print)/)?(?P<arxiv_id>\d{4}\.\d+)(?:v\d+)?(?:\.pdf)?$")
arxiv_url_only_re = re.compile(r"^(?:https?://(?:www\.)?arxiv\.org/(?:abs|pdf|e-print)/)(?P<arxiv_id>\d{4}\.\d+)(?:v\d+)?(?:\.pdf)?$")
pwc_url_re = re.compile(r"^(?:https?://(?:www\.)?)paperswithcode\.com/paper/(?P<slug>[^/]*)/?$")


def from_paper_dict(paper):
    """Convert a Papers with Code JSON record into a PReference."""
    authors = [PAuthor.from_fullname(a) for a in paper["authors"] if a.strip()]
    arxiv_id = None
    if paper["arxiv_id"]:
        arxiv_id = paper["arxiv_id"]
    elif paper["url_abs"]:
        m = arxiv_url_re.match(paper["url_abs"])
        if m:
            arxiv_id = m.group("arxiv_id")
    title = None
    if paper["title"]:
        title = paper["title"].rstrip(" .")
    slug = None
    if paper["paper_url"]:
        m = pwc_url_re.match(paper["paper_url"])
        if m:
            slug = m.group("slug")
    return PReference(
        title=title,
        authors=authors,
        ptr=paper["url_pdf"] or paper["url_abs"],
        arxiv_id=arxiv_id,
        pwc_slug=slug,
        date=paper["date"],
        orig_ref=f"{', '.join(paper['authors'])}. {paper['title']}.",
    )


def from_paper_elem(elem):
    """Convert a DBLP XML entry into a PReference."""
    authors_str = [a.text.strip() for a in elem.findall("author") if a.text and a.text.strip()]
    authors = [PAuthor.from_fullname(a) for a in authors_str]
    arxiv_id = None
    url = None
    for ee in elem.findall("ee"):
        if url is None or "oa" in ee.attrib:  # prefer open access urls
            url = ee.text
        m = arxiv_url_only_re.match(ee.text)
        if m:
            url = ee.text
            arxiv_id = m.group("arxiv_id")
            break
    title = None
    title_elem = elem.find("title")
    if title_elem is not None:
        title = title_elem.text.rstrip(" .")
    return PReference(
        title=title,
        authors=authors,
        ptr=url,
        arxiv_id=arxiv_id,
        orig_ref=f"{', '.join(authors_str)}. {title}.",
    )


def merge_references(p_references, elastic_references):
    # group parsed references by unique id and merge each one into a new or
    # already-indexed Reference2 document
    for p_ref in tqdm(p_references):
        uid = p_ref.unique_id()
        e_ref = elastic_references.get(uid)
        if not e_ref:
            e_ref = Reference2.from_ref(p_ref)
            elastic_references[uid] = e_ref
        e_ref.add_ref(p_ref)


def save_all(docs):
    bulk(connections.get_connection(), (d.to_dict(True) for d in docs), chunk_size=500)


def init_pwc():
    # read list of ML papers (titles, abstracts, arxiv ids, etc.)
    all_papers = read_pwc_papers(papers_path)

    # change dicts into PReferences
    p_references = [from_paper_dict(paper) for paper in all_papers]

    # keep references with valid ids
    p_references = [ref for ref in p_references if ref.unique_id()]

    all_ids = list(set(ref.unique_id() for ref in p_references))
    elastic_references = {
        uid: ref for uid, ref in zip(all_ids, Reference2.mget(all_ids))
        if ref
    }

    merge_references(p_references, elastic_references)
    save_all(elastic_references.values())


def init_dblp():
    dblp_xml = ET.parse(str(Path.home() / "data" / "dblp" / "dblp-10k-noent.xml"))
    root = dblp_xml.getroot()
    p_references = [from_paper_elem(elem) for elem in list(root)]
    p_references = [ref for ref in p_references if ref.unique_id()]

    all_ids = list(set(ref.unique_id() for ref in p_references))
    elastic_references = {
        uid: ref for uid, ref in zip(all_ids, Reference2.mget(all_ids))
        if ref
    }

    merge_references(p_references, elastic_references)
    save_all(elastic_references.values())


init_dblp()
# init_pwc()
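
The Papers with Code loader relies on only a handful of fields per record. Below is a minimal sketch of an input record accepted by from_paper_dict, assuming the definitions above are available in the same session; the key names come from the code, the values are made up for illustration.

# Hypothetical example record; only the keys accessed by from_paper_dict are shown,
# and all values are illustrative.
sample_paper = {
    "title": "An Example Paper on Reference Matching.",
    "authors": ["Jane Doe", "John Smith"],
    "arxiv_id": "",                                      # empty, so url_abs is tried instead
    "url_abs": "https://arxiv.org/abs/1234.56789",
    "url_pdf": "https://arxiv.org/pdf/1234.56789.pdf",
    "paper_url": "https://paperswithcode.com/paper/an-example-paper",
    "date": "2019-01-01",
}

ref = from_paper_dict(sample_paper)
# from_paper_dict derives arxiv_id "1234.56789" from url_abs
# and pwc_slug "an-example-paper" from paper_url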

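
For the DBLP path, from_paper_elem inspects only the author, title and ee children of each entry, and checks for an attribute named oa on ee to prefer open-access links. A rough sketch of a single entry it could process, again with made-up bibliographic data:

# Illustrative DBLP-style entry; element and attribute names follow what
# from_paper_elem looks for, the data itself is invented.
entry_xml = """
<article key="journals/example/Doe19">
  <author>Jane Doe</author>
  <author>John Smith</author>
  <title>An Example Paper on Reference Matching.</title>
  <ee>https://doi.org/10.0000/example</ee>
  <ee oa="oa">https://arxiv.org/abs/1234.56789</ee>
</article>
"""

elem = ET.fromstring(entry_xml)
ref = from_paper_elem(elem)
# the open-access arXiv link wins, so ptr is the arXiv url and arxiv_id is "1234.56789"

Note that save_all assumes a default Elasticsearch connection has already been registered with elasticsearch_dsl (connections.get_connection() would fail otherwise); presumably importing sota_extractor2.data.elastic takes care of that, but it is not visible in this file.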