
Commit f442bd3

Authored by Marcin Kardas
Merge pull request #5 from paperswithcode/push-api
Multiple improvements to results extraction
2 parents acdd321 + fdb06d0 commit f442bd3

33 files changed, with 1334 additions and 404 deletions

.gitignore

Lines changed: 3 additions & 0 deletions
@@ -98,3 +98,6 @@ venv.bak/
 .mypy_cache/
 .idea/*
 .vscode/settings.json
+
+# pytest
+.pytest_cache

README.md

Lines changed: 6 additions & 0 deletions
@@ -34,3 +34,9 @@ To test the whole extraction on a single file run
 ```
 make test
 ```
+
+### Unit Tests
+
+```
+PYTHONPATH=. py.test
+```
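
The new Unit Tests section runs py.test from the repository root; `PYTHONPATH=.` makes the in-repo packages (e.g. `sota_extractor2`) importable without installing them. A single test module could be selected with something like `PYTHONPATH=. py.test tests/test_extract_tables.py` (the path is hypothetical, shown only as an example).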

environment.yml

Lines changed: 1 addition & 1 deletion
@@ -13,7 +13,7 @@ dependencies:
 - python=3.7.1
 - pyahocorasick=1.4.0
 - Unidecode=1.0.23
-- elasticsearch-dsl=7.0.0
+- elasticsearch-dsl=6.3.1
 - ipython=7.5.0
 - joblib=0.13.2
 - python-magic=0.4.15
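
Pinning elasticsearch-dsl back to 6.3.1 appears to line up with the explicit `doc_type = '_doc'` declarations added to the Document classes in sota_extractor2/data/elastic.py below, which follow the 6.x mapping-type style rather than the typeless 7.x API; presumably the targeted Elasticsearch cluster is still on 6.x.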

extract_tables.py

Lines changed: 10 additions & 0 deletions
@@ -276,6 +276,16 @@ def save_tables(data, outdir):
         json.dump(metadata, f)


+def load_tables(path):
+    path = Path(path)
+    with open(path / "metadata.json", "r") as f:
+        metadata = json.load(f)
+
+    return [Table.from_file(
+        path,
+        table_metadata) for table_metadata in metadata]
+
+
 def set_ids_by_labels(soup):
     captions = soup.select(".ltx_caption")
     for caption in captions:
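
A minimal sketch of how the new loader pairs with save_tables; the directory below is hypothetical, and Table.from_file is assumed to rebuild each table from the files referenced in metadata.json:

```python
from extract_tables import load_tables

# "out/1812.01097" is a hypothetical directory previously written by save_tables()
tables = load_tables("out/1812.01097")
for table in tables:
    # each element is a Table reconstructed via Table.from_file(path, table_metadata)
    print(table)
```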

init_references.py

Lines changed: 155 additions & 0 deletions
@@ -0,0 +1,155 @@
+import re
+import json
+from pathlib import Path
+from collections import Counter
+from sota_extractor2.data.elastic import Reference2
+from elasticsearch_dsl import connections
+from sota_extractor2.data.references import PReference, PAuthor, ReferenceStore
+from tqdm import tqdm
+from elasticsearch.helpers import bulk
+from elasticsearch_dsl.connections import connections
+import http.client
+import xml.etree.ElementTree as ET
+
+# required for bulk saving
+http.client._MAXHEADERS = 1000
+
+connections.create_connection(hosts=['elasticsearch'], timeout=20)
+
+papers_path = Path("/data/dblp/papers/papers-with-abstracts.json")
+
+
+def read_pwc_papers(path):
+    with open(path, "rt") as f:
+        return json.load(f)
+
+
+arxiv_url_re = re.compile(r"^(?:https?://(?:www.)?arxiv.org/(?:abs|pdf|e-print)/)?(?P<arxiv_id>\d{4}\.\d+)(?:v\d+)?(?:\.pdf)?$")
+arxiv_url_only_re = re.compile(r"^(?:https?://(?:www.)?arxiv.org/(?:abs|pdf|e-print)/)(?P<arxiv_id>\d{4}\.\d+)(?:v\d+)?(?:\.pdf)?$")
+pwc_url_re = re.compile(r"^(?:https?://(?:www.)?)paperswithcode.com/paper/(?P<slug>[^/]*)/?$")
+
+
+def from_paper_dict(paper):
+    authors = [PAuthor.from_fullname(a) for a in paper["authors"] if a.strip()]
+    arxiv_id = None
+    if paper["arxiv_id"]:
+        arxiv_id = paper["arxiv_id"]
+    elif paper["url_abs"]:
+        m = arxiv_url_re.match(paper["url_abs"])
+        if m:
+            arxiv_id = m.group("arxiv_id")
+    title = None
+    if paper["title"]:
+        title = paper["title"].rstrip(" .")
+    slug = None
+    if paper["paper_url"]:
+        m = pwc_url_re.match(paper["paper_url"])
+        if m:
+            slug = m.group("slug")
+    return PReference(
+        title=title,
+        authors=authors,
+        ptr=paper["url_pdf"] or paper["url_abs"],
+        arxiv_id=arxiv_id,
+        pwc_slug=slug,
+        date=paper["date"],
+        orig_ref=f"{', '.join(paper['authors'])}. {paper['title']}.",
+    )
+
+
+def _text(elem): return "".join(elem.itertext())
+
+
+def from_paper_elem(elem):
+    authors_str = [_text(a).strip() for a in elem.findall("author")]
+    authors_str = [s for s in authors_str if s]
+    authors = [PAuthor.from_fullname(a) for a in authors_str]
+    arxiv_id = None
+    url = None
+    for ee in elem.findall("ee"):
+        if url is None or "oa" in ee.attrib: # prefere open access urls
+            url = _text(ee)
+        m = arxiv_url_only_re.match(_text(ee))
+        if m:
+            url = _text(ee) # prefere arxiv urls
+            arxiv_id = m.group("arxiv_id")
+            break
+    title = None
+    title_elem = elem.find("title")
+    if title_elem is not None:
+        title = _text(title_elem).rstrip(" .")
+    return PReference(
+        title=title,
+        authors=authors,
+        ptr=url,
+        arxiv_id=arxiv_id,
+        orig_ref=f"{', '.join(authors_str)}. {title}.",
+    )
+
+
+def merge_references(p_references, elastic_references):
+    uids = Counter([p_ref.unique_id() for p_ref in p_references])
+    for p_ref in tqdm(p_references):
+        uid = p_ref.unique_id()
+        # ignore papers with too common title
+        # (often these are "Editorial", "Preface", "Letter")
+        if uids[uid] > 5:
+            continue
+        e_ref = elastic_references.get(uid)
+        if not e_ref:
+            e_ref = Reference2.from_ref(p_ref)
+            elastic_references[uid] = e_ref
+        e_ref.add_ref(p_ref)
+
+
+def save_all(docs):
+    bulk(connections.get_connection(), (d.to_dict(True) for d in docs), chunk_size=500)
+
+
+def get_elastic_references(unique_ids, chunk_size=1000):
+    elastic_references = {}
+    i = 0
+    while i < len(unique_ids):
+        ids = unique_ids[i:i+chunk_size]
+        i += chunk_size
+        elastic_references.update({
+            uid: ref for uid, ref in zip(ids, Reference2.mget(ids))
+            if ref
+        })
+    return elastic_references
+
+
+def init_pwc():
+    # read list of ML papers (titles, abstracts, arxiv ids, etc.)
+    all_papers = read_pwc_papers(papers_path)
+
+    # change dicts into PReferences
+    p_references = [from_paper_dict(paper) for paper in all_papers]
+
+    # keep references with valid ids
+    p_references = [ref for ref in p_references if ref.unique_id()]
+
+    all_ids = list(set(ref.unique_id() for ref in p_references))
+    elastic_references = get_elastic_references(all_ids)
+    merge_references(p_references, elastic_references)
+    save_all(elastic_references.values())
+
+
+def init_dblp():
+    dblp_xml = ET.parse(str(Path("/data") / "dblp" / "dblp-noent.xml"))
+    #dblp_xml = ET.parse(str(Path("/data") / "dblp" / "dblp-small-noent.xml"))
+    root = dblp_xml.getroot()
+    p_references = [from_paper_elem(elem) for elem in root]
+    p_references = [ref for ref in p_references if ref.unique_id()]
+
+    all_ids = list(set(ref.unique_id() for ref in p_references))
+    # todo: add references2 index initialization
+    elastic_references = {} #get_elastic_references(all_ids)
+
+    merge_references(p_references, elastic_references)
+    save_all(elastic_references.values())
+
+# Reference2._index.delete()
+Reference2.init()
+init_dblp()
+init_pwc()
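
As a quick illustration of what the new URL patterns accept, here is a standalone sketch using a copy of arxiv_url_re; the sample URLs are made up and not part of the commit:

```python
import re

# copy of arxiv_url_re from init_references.py, reproduced here for illustration
arxiv_url_re = re.compile(
    r"^(?:https?://(?:www.)?arxiv.org/(?:abs|pdf|e-print)/)?"
    r"(?P<arxiv_id>\d{4}\.\d+)(?:v\d+)?(?:\.pdf)?$")

for url in ["1812.01097",
            "https://arxiv.org/abs/1812.01097v2",
            "https://www.arxiv.org/pdf/1812.01097.pdf",
            "https://example.com/not-arxiv.pdf"]:
    m = arxiv_url_re.match(url)
    print(url, "->", m.group("arxiv_id") if m else None)
# the first three yield "1812.01097"; the last one does not match
```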

latex2html.sh

Lines changed: 6 additions & 2 deletions
@@ -1,10 +1,11 @@
 #!/usr/bin/env bash
 OUTNAME="$1"
 echo $OUTNAME
-RO_SOURCE_DIR="/files/ro-source"
+RO_SOURCE_DIR="${2:-/files/ro-source}"
 SOURCE_DIR="/files/source"
-OUTPUT_DIR="/files/htmls"
+OUTPUT_DIR="${3:-/files/htmls}"

+mkdir -p /files
 cp -r "$RO_SOURCE_DIR" "$SOURCE_DIR"

 # turn tikzpciture instances into comments
@@ -23,6 +24,9 @@ do
 done

 MAINTEX=$(python3 /files/guess_main.py "$SOURCE_DIR")
+[ ! -f "$MAINTEX" ] && exit 1
+
 timeout -s KILL 300 engrafo "$MAINTEX" /files/output

+[ ! -f /files/output/index.html ] && exit 117
 cp /files/output/index.html "$OUTPUT_DIR/$OUTNAME"
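
With these changes the source and output directories can be overridden from the command line as optional second and third arguments, e.g. `./latex2html.sh 1812.01097.html /data/sources/1812.01097 /data/htmls` (the paths shown are hypothetical). The script also fails fast: it exits with status 1 when no main .tex file is found and with status 117 when engrafo produces no index.html.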

parse_references.py

Lines changed: 3 additions & 27 deletions
@@ -17,31 +17,7 @@
 pc = PaperCollection.from_pickle("/mnt/efs/pwc/data/pc-small-noann.pkl")


-def get_refstrings(p):
-    paper = p.text if hasattr(p, 'text') else p
-    if not hasattr(paper, 'fragments'):
-        return
-    fragments = paper.fragments
-    ref_sec_started = False
-    for f in reversed(fragments):
-        if f.header.startswith('xxanchor-bib'):
-            ref_sec_started = True
-            yield f.text
-        elif ref_sec_started:
-            break # the refsection is only at the end of paper
-
-
-_ref_re = regex.compile(r'^\s*(?:xxanchor-bib\s)?xxanchor-([a-zA-Z0-9-]+)\s(.+)$')
-def extract_refs(p):
-    for ref in get_refstrings(p):
-        m = _ref_re.match(ref)
-        if m:
-            ref_id, ref_str = m.groups()
-            yield {
-                "paper_arxiv_id": p.arxiv_no_version,
-                "ref_id": ref_id,
-                "ref_str": ref_str.strip(r'\s')
-            }
+

 class PaperCollectionReferenceParser:
     def __init__(self):
@@ -52,13 +28,13 @@ def __init__(self):
     def parse_refs(self, p):
         for d in extract_refs(p):
             if not d["ref_id"].startswith("pwc-"):
-                key = d["paper_arxiv_id"] + d["ref_id"]
+                key = p.arxiv_no_version + d["ref_id"]
                 if key not in self.cache:
                     new_id = self.refsdb.add_reference_string(d['ref_str'])
                     if new_id is not None:
                         new_id = "pwc-" + new_id
                     self.cache[key] = new_id
-                if self.cache[key] and len(self.cache[key]) > 500: # fix to self.cache to make the id compatible with elastic
+                if self.cache[key] and len(self.cache[key]) > ID_LIMIT: # fix to self.cache to make the id compatible with elastic
                     self.cache[key] = self.cache[key][:ID_LIMIT]
                 yield d["ref_id"], self.cache[key]
         self.refsdb.sync()

sota_extractor2/config.py

Lines changed: 1 addition & 1 deletion
@@ -14,7 +14,7 @@


 elastic = dict(hosts=['localhost'], timeout=20)
-grobid = dict(host='10.0.1.145')
+grobid = dict(host='grobid')

 arxiv = data/'arxiv'
 htmls_raw = arxiv/'htmls'

sota_extractor2/data/elastic.py

Lines changed: 43 additions & 2 deletions
@@ -108,7 +108,11 @@ class Fragment(Document):
     )
     outer_headers = Text(analyzer=html_strip, )

+    class Meta:
+        doc_type = '_doc'
+
     class Index:
+        doc_type = '_doc'
         name = 'paper-fragments'

     @classmethod
@@ -138,7 +142,11 @@ class Paper(Document):
         analyzer=html_strip
     )

+    class Meta:
+        doc_type = '_doc'
+
     class Index:
+        doc_type = '_doc'
         name = 'papers'

     def to_json(self):
@@ -290,26 +298,42 @@ class Reference(Document):
     urls = Keyword()
     is_ml = Boolean()

+    class Meta:
+        doc_type = '_doc'
+
     class Index:
+        doc_type = '_doc'
         name = 'references'

     def __repr__(self):
         return f"{self.title} / {self.authors}"

+
 ID_LIMIT=480

+
+class Author2(InnerDoc):
+    forenames = Text(fields={'keyword': Keyword()})
+    surname = Text(fields={'keyword': Keyword()})
+
+
 class Reference2(Document):
     title = Text()
-    authors = Text()
+    authors = Object(Author2)

     idno = Keyword()
     date = Date()
     ptr = Keyword()

     arxiv_id = Keyword()
+    pwc_slug = Keyword()
     orig_refs = Text()

+    class Meta:
+        doc_type = '_doc'
+
     class Index:
+        doc_type = '_doc'
         name = 'references2'

     def add_ref(self, ref):
@@ -318,14 +342,15 @@ def add_ref(self, ref):
         # self.refs.append(asdict(ref))
         if ref.arxiv_id:
             self.arxiv_id = ref.arxiv_id
+        if ref.pwc_slug:
+            self.pwc_slug = ref.pwc_slug
         if ref.idno:
             if hasattr(ref.idno, 'values'):
                 self.idno = ([None]+[v for v in ref.idno.values() if v.startswith("http")]).pop()
             elif isinstance(ref.idno, str):
                 self.idno = ref.idno
         # if ref.date:
         #     self.date = ref.date
-        self.date = None
         if ref.ptr:
             self.ptr = ref.ptr
         self.orig_refs = self.orig_refs if self.orig_refs else []
@@ -414,3 +439,19 @@ def display_fragment(f, cell_type="", display=True):
     if display:
         display_html(html)
     return html
+
+
+def query_for_evidences(paper_id, values, topk=5, fragment_size=50):
+    evidence_query = Fragment.search().highlight(
+        'text', pre_tags="<b>", post_tags="</b>", fragment_size=fragment_size)
+
+    query = {
+        "query": ' '.join(values)
+    }
+
+    fragments = list(evidence_query
+                     .filter('term', paper_id=paper_id)
+                     .query('match', text=query)[:topk]
+                     )
+
+    return '\n'.join([' '.join(f.meta['highlight']['text']) for f in fragments])
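
A minimal usage sketch for the new evidence helper; the paper id and cell values below are made up, and the function is assumed to be importable at module level from sota_extractor2.data.elastic:

```python
from elasticsearch_dsl import connections
from sota_extractor2.data.elastic import query_for_evidences

# connection settings mirror the ones used elsewhere in the repo; adjust as needed
connections.create_connection(hosts=['elasticsearch'], timeout=20)

# returns up to topk highlighted fragments from the paper's indexed text,
# joined into a single newline-separated string
snippets = query_for_evidences(
    paper_id="1812.01097",                  # hypothetical arxiv id
    values=["ImageNet", "top-1 accuracy"],  # table cell values to search for
    topk=5, fragment_size=50)
print(snippets)
```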
