
Commit 6489719

Author: Marcin Kardas (committed)

Reference extraction

1 parent 3e76133 commit 6489719

File tree: 5 files changed, +58 −40 lines

init_references.py (3 additions & 2 deletions)

@@ -2,7 +2,8 @@
 import json
 from pathlib import Path
 from collections import Counter
-from sota_extractor2.data.elastic import Reference2, setup_default_connection
+from sota_extractor2.data.elastic import Reference2
+from elasticsearch_dsl import connections
 from sota_extractor2.data.references import PReference, PAuthor, ReferenceStore
 from tqdm import tqdm
 from elasticsearch.helpers import bulk
@@ -13,7 +14,7 @@
 # required for bulk saving
 http.client._MAXHEADERS = 1000

-setup_default_connection()
+connections.create_connection(hosts=['elasticsearch'], timeout=20)

 papers_path = Path("/data/dblp/papers/papers-with-abstracts.json")

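The change above replaces the project's setup_default_connection() helper with an explicit elasticsearch_dsl connection. A minimal sketch of what that call sets up, assuming an Elasticsearch node reachable under the host name 'elasticsearch' (taken from the diff); the ping() check is illustrative and not part of the commit:

import re  # not needed here; shown only if the script is run standalone
from elasticsearch_dsl import connections

# Register the default connection that elasticsearch_dsl Document classes
# (such as Reference2, presumably) use implicitly when saving or searching.
es = connections.create_connection(hosts=['elasticsearch'], timeout=20)
print(es.ping())  # True once the cluster is reachable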

parse_references.py (1 addition & 27 deletions)

@@ -17,33 +17,7 @@
 pc = PaperCollection.from_pickle("/mnt/efs/pwc/data/pc-small-noann.pkl")


-def get_refstrings(p):
-    paper = p.text if hasattr(p, 'text') else p
-    if not hasattr(paper, 'fragments'):
-        return
-    fragments = paper.fragments
-    ref_sec_started = False
-    for f in reversed(fragments):
-        if f.header.startswith('xxanchor-bib'):
-            ref_sec_started = True
-            yield f.text
-        elif ref_sec_started:
-            # todo: check if a paper can have multiple bibliography sections
-            # (f.e., one in the main paper and one in the appendix)
-            break  # the refsection is only at the end of paper
-
-
-_ref_re = regex.compile(r'^\s*(?:xxanchor-bib\s)?xxanchor-([a-zA-Z0-9-]+)\s(.+)$')
-def extract_refs(p):
-    for ref in get_refstrings(p):
-        m = _ref_re.match(ref)
-        if m:
-            ref_id, ref_str = m.groups()
-            yield {
-                "paper_arxiv_id": p.arxiv_no_version,
-                "ref_id": ref_id,
-                "ref_str": ref_str.strip(r'\s')
-            }
+

 class PaperCollectionReferenceParser:
     def __init__(self):
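The two helpers deleted here are re-added below in sota_extractor2/data/references.py, so this script would presumably import them from their new location, e.g.:

# Assumed import after the move; the new import line itself is not shown in this commit.
from sota_extractor2.data.references import get_refstrings, extract_refs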

sota_extractor2/data/references.py (30 additions & 0 deletions)

@@ -344,3 +344,33 @@ def sync(self):
                 p.save()
             except ConflictError:
                 pass
+
+
+def get_refstrings(p):
+    paper = p.text if hasattr(p, 'text') else p
+    if not hasattr(paper, 'fragments'):
+        return
+    fragments = paper.fragments
+    ref_sec_started = False
+    for f in reversed(fragments):
+        if f.header.startswith('xxanchor-bib'):
+            ref_sec_started = True
+            yield f.text
+        elif ref_sec_started:
+            # todo: check if a paper can have multiple bibliography sections
+            # (f.e., one in the main paper and one in the appendix)
+            break  # the refsection is only at the end of paper
+
+
+
+_ref_re = re.compile(r'^\s*(?:xxanchor-bib\s)?xxanchor-([a-zA-Z0-9-]+)\s(.+)$')
+def extract_refs(p):
+    for ref in get_refstrings(p):
+        m = _ref_re.match(ref)
+        if m:
+            ref_id, ref_str = m.groups()
+            yield {
+                "paper_arxiv_id": p.arxiv_no_version,
+                "ref_id": ref_id,
+                "ref_str": ref_str.strip(r'\s')
+            }
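A quick sketch of what the anchor-matching regex extracts from a bibliography fragment and of the record shape extract_refs yields. The SimpleNamespace objects are illustrative stand-ins for the repository's paper and fragment classes, modelling only the attributes these helpers touch:

import re
from types import SimpleNamespace

# Hypothetical stand-ins for a paper and one of its bibliography fragments.
fragment = SimpleNamespace(
    header="xxanchor-bib xxanchor-smith2019",
    text="xxanchor-smith2019 J. Smith. A Paper About Papers. 2019.",
)
paper = SimpleNamespace(fragments=[fragment], arxiv_no_version="1234.56789")

_ref_re = re.compile(r'^\s*(?:xxanchor-bib\s)?xxanchor-([a-zA-Z0-9-]+)\s(.+)$')
ref_id, ref_str = _ref_re.match(fragment.text).groups()
# This mirrors the dict that extract_refs yields for each matched reference.
record = {"paper_arxiv_id": paper.arxiv_no_version, "ref_id": ref_id, "ref_str": ref_str}
print(record)
# {'paper_arxiv_id': '1234.56789', 'ref_id': 'smith2019',
#  'ref_str': 'J. Smith. A Paper About Papers. 2019.'}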

sota_extractor2/data/table.py (1 addition & 1 deletion)

@@ -24,7 +24,7 @@ class Cell:

 def extract_references(s):
     parts = reference_re.split(s)
-    refs = parts[1::3]
+    refs = [r.replace('-', '') for r in parts[1::3]]
     text = []
     for i, x in enumerate(parts):
         if i % 3 == 0:
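For context, how the parts[1::3] slice and the new hyphen stripping behave. reference_re is not shown in this commit, so the two-capture-group pattern below is only an assumed stand-in:

import re

# Assumed stand-in for reference_re: two capturing groups, so re.split
# interleaves plain text with the two groups of every match.
reference_re = re.compile(r'xxref-([a-zA-Z0-9-]+)(\s?)')

parts = reference_re.split("see xxref-deep-residual-networks for details")
# parts == ['see ', 'deep-residual-networks', ' ', 'for details']
refs = [r.replace('-', '') for r in parts[1::3]]
print(refs)  # ['deepresidualnetworks']  (hyphens removed by the new line)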

sota_extractor2/helpers/cache.py (23 additions & 10 deletions)

@@ -1,6 +1,7 @@
 import pandas as pd
 import json
 from collections import defaultdict
+from pathlib import Path


 # these functions are used to cache various results
@@ -10,26 +11,38 @@
 # can be changed.


+def _load_json(path):
+    with Path(path).open('rt') as f:
+        return json.load(f)
+
+
+def _save_json(obj, path):
+    with Path(path).open('wt') as f:
+        json.dump(obj, f)
+
+
+def load_references(path):
+    return _load_json(path)
+
+
+def save_references(references, path):
+    _save_json(references, path)
+
+
 def load_tags(path):
-    with open(path, 'rt') as f:
-        tags = json.load(f)
-    return tags
+    return _load_json(path)


 def save_tags(tags, path):
-    with open(path, 'wt') as f:
-        json.dump(tags, f)
+    _save_json(tags, path)


 def load_structure(path):
-    with open(path, 'rt') as f:
-        structure = json.load(f)
-    return structure
+    return _load_json(path)


 def save_structure(structure, path):
-    with open(path, 'wt') as f:
-        json.dump(structure, f)
+    _save_json(structure, path)


 def load_proposals(path):
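A minimal round-trip sketch for the new reference cache helpers; the temporary path and the shape of the cached records are illustrative assumptions, not values from the commit:

from sota_extractor2.helpers.cache import save_references, load_references

refs = [{"paper_arxiv_id": "1234.56789",
         "ref_id": "smith2019",
         "ref_str": "J. Smith. A Paper About Papers. 2019."}]
save_references(refs, "/tmp/references.json")   # serialized via _save_json
assert load_references("/tmp/references.json") == refs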
