Skip to content

Commit 3e76133

Browse files
author
Marcin Kardas
committed
Fixes to reference importing
1 parent 6e6237a commit 3e76133

File tree

3 files changed

+54
-25
lines changed

3 files changed

+54
-25
lines changed

init_references.py

Lines changed: 44 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
import re
22
import json
33
from pathlib import Path
4-
from sota_extractor2.data.elastic import Reference2
4+
from collections import Counter
5+
from sota_extractor2.data.elastic import Reference2, setup_default_connection
56
from sota_extractor2.data.references import PReference, PAuthor, ReferenceStore
67
from tqdm import tqdm
78
from elasticsearch.helpers import bulk
@@ -12,8 +13,9 @@
1213
# required for bulk saving
1314
http.client._MAXHEADERS = 1000
1415

15-
# papers_path = Path("/tmp/papers/papers-with-abstracts.json")
16-
papers_path = Path("/tmp/papers/papers-with-abstracts-duplicates.json")
16+
setup_default_connection()
17+
18+
papers_path = Path("/data/dblp/papers/papers-with-abstracts.json")
1719

1820

1921
def read_pwc_papers(path):
@@ -54,23 +56,27 @@ def from_paper_dict(paper):
5456
)
5557

5658

59+
def _text(elem): return "".join(elem.itertext())
60+
61+
5762
def from_paper_elem(elem):
58-
authors_str = [a.text.strip() for a in elem.findall("author") if a.text.strip()]
63+
authors_str = [_text(a).strip() for a in elem.findall("author")]
64+
authors_str = [s for s in authors_str if s]
5965
authors = [PAuthor.from_fullname(a) for a in authors_str]
6066
arxiv_id = None
6167
url = None
6268
for ee in elem.findall("ee"):
6369
if url is None or "oa" in ee.attrib: # prefere open access urls
64-
url = ee.text
65-
m = arxiv_url_only_re.match(ee.text)
70+
url = _text(ee)
71+
m = arxiv_url_only_re.match(_text(ee))
6672
if m:
67-
url = ee.text
73+
url = _text(ee) # prefere arxiv urls
6874
arxiv_id = m.group("arxiv_id")
6975
break
7076
title = None
7177
title_elem = elem.find("title")
7278
if title_elem is not None:
73-
title = title_elem.text.rstrip(" .")
79+
title = _text(title_elem).rstrip(" .")
7480
return PReference(
7581
title=title,
7682
authors=authors,
@@ -80,9 +86,14 @@ def from_paper_elem(elem):
8086
)
8187

8288

83-
def merge_references(p_references):
89+
def merge_references(p_references, elastic_references):
90+
uids = Counter([p_ref.unique_id() for p_ref in p_references])
8491
for p_ref in tqdm(p_references):
8592
uid = p_ref.unique_id()
93+
# ignore papers with too common title
94+
# (often these are "Editorial", "Preface", "Letter")
95+
if uids[uid] > 5:
96+
continue
8697
e_ref = elastic_references.get(uid)
8798
if not e_ref:
8899
e_ref = Reference2.from_ref(p_ref)
@@ -94,6 +105,19 @@ def save_all(docs):
94105
bulk(connections.get_connection(), (d.to_dict(True) for d in docs), chunk_size=500)
95106

96107

108+
def get_elastic_references(unique_ids, chunk_size=1000):
    """Fetch existing ``Reference2`` documents from Elasticsearch by id.

    Looks up ``unique_ids`` in batches of ``chunk_size`` (a single mget
    with a huge id list can exceed request-size limits) and returns a
    dict mapping each *found* id to its document; ids with no matching
    document are omitted (``mget`` yields a falsy entry for misses).
    """
    elastic_references = {}
    # Idiomatic fixed-size batching via range(step) instead of a manual
    # while-loop with a hand-incremented index.
    for start in range(0, len(unique_ids), chunk_size):
        ids = unique_ids[start:start + chunk_size]
        elastic_references.update({
            uid: ref
            for uid, ref in zip(ids, Reference2.mget(ids))
            if ref
        })
    return elastic_references
119+
120+
97121
def init_pwc():
98122
# read list of ML papers (titles, abstracts, arxiv ids, etc.)
99123
all_papers = read_pwc_papers(papers_path)
@@ -105,29 +129,26 @@ def init_pwc():
105129
p_references = [ref for ref in p_references if ref.unique_id()]
106130

107131
all_ids = list(set(ref.unique_id() for ref in p_references))
108-
elastic_references = {
109-
uid: ref for uid, ref in zip(all_ids, Reference2.mget(all_ids))
110-
if ref
111-
}
112-
113-
merge_references(p_references)
132+
elastic_references = get_elastic_references(all_ids)
133+
merge_references(p_references, elastic_references)
114134
save_all(elastic_references.values())
115135

116136

117137
def init_dblp():
    """Import references parsed from a DBLP XML dump into Elasticsearch.

    Parses the (entity-resolved, "noent") dump, converts each record to a
    PReference, merges them into Reference2 documents and bulk-saves them.
    """
    dblp_xml = ET.parse(str(Path("/data") / "dblp" / "dblp-noent.xml"))
    #dblp_xml = ET.parse(str(Path("/data") / "dblp" / "dblp-small-noent.xml"))
    root = dblp_xml.getroot()

    # Parse every child element, then drop records lacking a usable id.
    parsed_refs = [from_paper_elem(child) for child in root]
    parsed_refs = [ref for ref in parsed_refs if ref.unique_id()]

    all_ids = list({ref.unique_id() for ref in parsed_refs})
    # todo: add references2 index initialization
    elastic_references = {} #get_elastic_references(all_ids)

    merge_references(parsed_refs, elastic_references)
    save_all(elastic_references.values())
131150

151+
# Reference2._index.delete()
152+
Reference2.init()
132153
init_dblp()
133-
#init_pwc()
154+
init_pwc()

sota_extractor2/data/elastic.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -296,11 +296,18 @@ class Index:
296296
def __repr__(self):
297297
return f"{self.title} / {self.authors}"
298298

299+
299300
ID_LIMIT=480
300301

302+
303+
class Author2(InnerDoc):
    # Structured author name embedded in Reference2 (replaces the previous
    # flat Text `authors` field).  Each name part is full-text searchable,
    # with a `.keyword` sub-field for exact matching and aggregations.
    forenames = Text(fields={'keyword': Keyword()})
    surname = Text(fields={'keyword': Keyword()})
306+
307+
301308
class Reference2(Document):
302309
title = Text()
303-
authors = Text()
310+
authors = Object(Author2)
304311

305312
idno = Keyword()
306313
date = Date()

sota_extractor2/data/references.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
import json
77
import regex as re
8+
from unidecode import unidecode
89
import requests
910
import shelve
1011
import xmltodict
@@ -33,7 +34,7 @@ def strip_anchor(ref_str):
3334

3435
_tokenizer_re = re.compile(r'[^/a-z0-9\\:?#\[\]\(\).-–]+')
3536
def normalize_title(s, join=True):
    """Normalize a title for duplicate/lookup matching.

    Transliterates *s* to ASCII (unidecode), lowercases it, and splits it
    into tokens with ``_tokenizer_re``.  Returns the tokens joined with
    "-" when ``join`` is true, otherwise the raw token list.

    NOTE(review): ``.strip()`` removes surrounding whitespace only, so a
    leading/trailing "-" from empty edge tokens survives — confirm intended.
    """
    tokens = _tokenizer_re.split(unidecode(s).lower())
    if not join:
        return tokens
    return "-".join(tokens).strip()
3839

3940
def to_normal_dict(d):

0 commit comments

Comments
 (0)