Skip to content

Commit 3be7094

Browse files
committed
Normalize references fast
1 parent 7baae77 commit 3be7094

File tree

3 files changed

+88
-1
lines changed

3 files changed

+88
-1
lines changed

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ HTMLS = $(patsubst $(ARCHIVES_DIR)/%.gz,$(HTMLS_DIR)/%.html,$(ARCHIVES))
1313
FIXED_HTMLS = $(patsubst $(ARCHIVES_DIR)/%.gz,$(FIXED_HTMLS_DIR)/%.html,$(ARCHIVES))
1414
TABLES = $(patsubst $(ARCHIVES_DIR)/%.gz,$(TABLES_DIR)/%,$(ARCHIVES))
1515

16-
$(shell mkdir -p "$(DATA_DIR)" "$(UNPACKED_DIR)" "$(HTMLS_DIR)" "$(FIXED_HTMLS_DIR)" "$(TABLES_DIR)")
16+
$(shell mkdir -p "$(DATA_DIR)" "$(ANNOTATIONS_DIR)" "$(UNPACKED_DIR)" "$(HTMLS_DIR)" "$(FIXED_HTMLS_DIR)" "$(TABLES_DIR)")
1717

1818
.PHONY: all
1919
all: $(ANNOTATIONS_DIR)/pdfs-urls.csv $(ANNOTATIONS_DIR)/sources-urls.csv extract_all

environment.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,3 +11,5 @@ dependencies:
1111
- beautifulsoup4=4.7.1
1212
- numpy=1.15.4
1313
- python=3.7.1
14+
- pyahocorasick=1.4.0
15+
- Unidecode=1.0.23

normalize_references.py

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
import fire
2+
from unidecode import unidecode
3+
from pathlib import Path
4+
import string
5+
import ahocorasick
6+
import pickle
7+
from multiprocessing import Pool
8+
from sota_extractor2.data.elastic import get_text, Paper
9+
10+
# Translation table for str.translate() that deletes every character listed in
# string.punctuation (ASCII punctuation only).
punctuation_table = str.maketrans('', '', string.punctuation)
11+
12+
def normalize_title(title):
    """Return a compact, ASCII-only key for *title*.

    Steps, in order: trim surrounding whitespace, lower-case, drop space
    characters, transliterate to ASCII with ``unidecode``, then delete ASCII
    punctuation.

    NOTE(review): only the space character is removed (tabs/newlines inside
    the string survive), and lower-casing/space removal run *before*
    transliteration. Presumably the automaton keys were produced with this
    same function, so keep the steps in this order -- confirm before changing.
    """
    compact = title.strip().lower().replace(' ', '')
    ascii_only = unidecode(compact)
    return ascii_only.translate(punctuation_table)
14+
15+
def resolve_references(reference_trie, bibitems):
    """Map bibliography item ids to the ids of known references.

    Each bibliography text is normalised with ``normalize_title`` and scanned
    with ``reference_trie.iter``; every hit yields a reference record (a
    mapping with ``'title'``, ``'authors'`` and ``'id'`` keys). Candidates whose
    normalised title is shorter than 6 characters are ignored, the rest are
    tried longest title first, and the first candidate for which the last
    word of every author's name occurs in the normalised text is accepted.

    Args:
        reference_trie: object exposing ``iter(text)`` that yields
            ``(position, reference)`` pairs for keys found in ``text``.
        bibitems: mapping of bibliography id -> bibliography entry text.

    Returns:
        dict mapping each resolved bibliography id to ``reference['id']``.
        Unresolved ids are absent. An empty ``bibitems`` gives ``{}``.

    Side effects:
        Prints every matched entry text and reference title, and a final
        summary line with the number and fraction of resolved entries.
    """
    if not bibitems:
        return {}

    found = 0
    resolved = {}
    for bib_id, text in bibitems.items():
        # Normalise the entry once; it is reused for the trie scan and for
        # every author check below.
        normalized_text = normalize_title(text)

        candidates = []
        for _, ref in reference_trie.iter(normalized_text):
            title_length = len(normalize_title(ref['title']))
            # Skip very short titles (presumably too prone to accidental
            # substring hits).
            if title_length >= 6:
                candidates.append((title_length, ref))
        # Longest title first. list.sort is stable even with reverse=True, so
        # equally long titles keep the order in which the trie reported them.
        candidates.sort(key=lambda item: item[0], reverse=True)

        for _, ref in candidates:
            surnames = (
                normalize_title(author['name'].split(' ')[-1])
                for author in ref['authors']
            )
            # A reference with no authors passes this check vacuously.
            if all(surname in normalized_text for surname in surnames):
                print(text)
                print(ref['title'])
                found += 1
                resolved[bib_id] = ref['id']
                break
    print(f"Found {found} ({found / len(bibitems)})")
    return resolved
37+
38+
def update_references(html, mapping):
    """Rename in-document link targets and anchor ids in place.

    Every element whose ``href`` starts with ``#`` has the fragment after the
    ``#`` replaced by ``mapping[fragment]`` when present; every ``<a>`` with a
    non-empty ``id`` has that id replaced the same way. Names missing from
    *mapping* are left unchanged. Nothing is returned; *html* is mutated.
    """
    def remap(name):
        # Fall back to the original name when no replacement is known.
        return mapping.get(name, name)

    for link in html.select('[href^="#"]'):
        fragment = link['href'][1:]
        link['href'] = '#' + remap(fragment)

    for tag in html.select('a[id]:not([id=""])'):
        tag['id'] = remap(tag['id'])
47+
48+
def get_bibitems(html):
    """Collect bibliography entries from *html*.

    Looks at every ``p.bibitem`` inside ``.thebibliography``. An entry is kept
    only if it contains an ``<a>`` with a non-empty ``id``; the first such id
    becomes the key and ``get_text(entry)`` the value. If two entries share an
    id, the later one wins.

    Returns:
        dict mapping bibliography id -> entry text.
    """
    bibitems = {}
    for entry in html.select(".thebibliography p.bibitem"):
        labelled = entry.select('a[id]:not([id=""])')
        if not labelled:
            # No usable id to key this entry by.
            continue
        bibitems[labelled[0]['id']] = get_text(entry)
    return bibitems
57+
58+
def save_html(path, html):
    """Write ``str(html)`` to *path* as UTF-8 text, replacing any existing file.

    Args:
        path: destination file path (str or path-like).
        html: object whose ``str()`` form is the document to store.
    """
    # Explicit encoding: without it open() uses the locale's preferred
    # encoding, which can fail on non-ASCII characters or produce files that
    # differ between machines.
    with open(path, 'w', encoding='utf-8') as f:
        f.write(str(html))
61+
62+
def resolve_references_in_html(args):
    """Resolve and rewrite the bibliography ids of one HTML file.

    Takes a single tuple so it can be used directly with ``Pool.map``.

    Args:
        args: ``(file, output)`` pair -- the HTML file to read and the
            ``pathlib.Path`` to write the updated document to. Missing parent
            directories of ``output`` are created.

    Reads the module-level ``reference_trie``, which ``normalize_references``
    assigns before starting the worker pool.
    """
    file, output = args
    output.parent.mkdir(exist_ok=True, parents=True)
    # Fixed: the original passed the undefined name ``f`` here, which raised
    # NameError for every file.
    html = Paper.read_html(file)
    bibitems = get_bibitems(html)
    mapping = resolve_references(reference_trie, bibitems)
    update_references(html, mapping)
    save_html(output, html)
70+
71+
# Hard-coded, machine-specific path (presumably a JSON dump of reference
# records -- TODO confirm). Not read by any code visible in this file.
DUMP_REFERENCES_PATH = Path("/home/ubuntu/pwc/mycache/references-short.json")
72+
73+
# Hard-coded, machine-specific path (presumably the pickled automaton -- TODO
# confirm). Not read by any code visible in this file: normalize_references
# receives the automaton path as an argument instead.
TRIE_PATH = Path("/home/ubuntu/pwc/mycache/automaton.pkl")
74+
75+
def normalize_references(source_path, target_path, automaton, jobs=1):
    """Rewrite bibliography ids in every HTML file under *source_path*.

    Args:
        source_path: directory searched recursively for ``*.html`` files.
        target_path: directory the rewritten files are written to.
        automaton: path of a pickle file holding the reference automaton.
        jobs: number of worker processes.

    The unpickled automaton is stored in the module-level ``reference_trie``
    so that ``resolve_references_in_html`` can reach it from the workers.
    """
    global reference_trie
    source_path, target_path = Path(source_path), Path(target_path)

    # NOTE(review): pickle.load can execute arbitrary code from the file;
    # only pass an automaton file from a trusted source.
    with open(automaton, 'rb') as trie_file:
        reference_trie = pickle.load(trie_file)

    # NOTE(review): workers see reference_trie only if they inherit this
    # process's globals (presumably a fork start method) -- confirm.
    with Pool(jobs) as pool:
        tasks = []
        for html_file in source_path.glob("**/*.html"):
            # Only the file name is kept, so same-named files from different
            # sub-directories map to the same output path.
            tasks.append((html_file, target_path / html_file.name))
        pool.map(resolve_references_in_html, tasks)
84+
85+
# Command-line entry point: expose normalize_references through python-fire.
if __name__ == "__main__":
    fire.Fire(normalize_references)

0 commit comments

Comments
 (0)