|
| 1 | +import fire |
| 2 | +from unidecode import unidecode |
| 3 | +from pathlib import Path |
| 4 | +import string |
| 5 | +import ahocorasick |
| 6 | +import pickle |
| 7 | +from multiprocessing import Pool |
| 8 | +from sota_extractor2.data.elastic import get_text, Paper |
| 9 | + |
# Translation table that deletes every ASCII punctuation character in one pass.
punctuation_table = str.maketrans('', '', string.punctuation)

def normalize_title(title):
    """Normalize a title for fuzzy matching.

    Trims, lowercases, removes spaces, transliterates to ASCII via
    unidecode, and strips ASCII punctuation.
    """
    collapsed = title.strip().lower().replace(' ', '')
    return unidecode(collapsed).translate(punctuation_table)
| 14 | + |
def resolve_references(reference_trie, bibitems):
    """Match free-text bibliography entries to known references.

    Args:
        reference_trie: Aho-Corasick automaton whose ``iter(text)`` yields
            ``(position, ref)`` pairs; each ``ref`` is a dict with 'id',
            'title' and 'authors' (each author a dict with a 'name' key).
        bibitems: mapping of bibitem id -> raw bibliography entry text.

    Returns:
        dict mapping bibitem id -> resolved reference id for every entry
        whose (sufficiently long) title matched inside the entry text and
        whose authors' surnames all appear in that text.
    """
    if not bibitems:
        return {}
    found = 0
    resolved = {}
    for bib_id, text in bibitems.items():
        # Hoisted: the normalized entry text is reused for the trie scan
        # and for every author-surname check below.
        normalized = normalize_title(text)
        # Candidate titles found inside the entry. Normalize each title
        # once; titles shorter than 6 chars are dropped as spurious hits,
        # and the longest titles are tried first.
        candidates = [(len(normalize_title(ref['title'])), ref)
                      for _, ref in reference_trie.iter(normalized)]
        candidates = [c for c in candidates if c[0] >= 6]
        candidates.sort(key=lambda c: c[0], reverse=True)
        for _, ref in candidates:
            # Accept only if every author's surname occurs in the entry.
            if all(normalize_title(author['name'].split(' ')[-1]) in normalized
                   for author in ref['authors']):
                print(text)
                print(ref['title'])
                found += 1
                resolved[bib_id] = ref['id']
                break
    print(f"Found {found} ({found / len(bibitems)})")
    return resolved
| 37 | + |
def update_references(html, mapping):
    """Rewrite intra-document links and anchor ids using *mapping*.

    Every href of the form ``#<id>`` and every non-empty anchor ``id`` is
    remapped through *mapping*; ids not present in *mapping* are kept as-is.
    Mutates *html* in place.
    """
    # Remap internal links (href="#<bib_id>").
    for link in html.select('[href^="#"]'):
        old_target = link['href'][1:]
        link['href'] = '#' + mapping.get(old_target, old_target)
    # Remap the anchor ids themselves.
    for link in html.select('a[id]:not([id=""])'):
        old_id = link['id']
        link['id'] = mapping.get(old_id, old_id)
| 47 | + |
def get_bibitems(html):
    """Collect bibliography entries from a parsed paper.

    Returns a dict mapping each bibitem's first non-empty anchor id to the
    entry's plain text; entries without a named anchor are skipped.
    """
    entries = {}
    for item in html.select(".thebibliography p.bibitem"):
        named_anchors = item.select('a[id]:not([id=""])')
        if named_anchors:
            entries[named_anchors[0]['id']] = get_text(item)
    return entries
| 57 | + |
def save_html(path, html):
    """Serialize *html* with ``str()`` and write it to *path* as text."""
    Path(path).write_text(str(html))
| 61 | + |
def resolve_references_in_html(args):
    """Worker: resolve and rewrite bibliography references in one file.

    Args:
        args: ``(input_path, output_path)`` tuple (packed because
            ``Pool.map`` passes a single argument).

    Relies on the module-global ``reference_trie`` being available in the
    worker process (set up by ``normalize_references``).
    """
    file, output = args
    output.parent.mkdir(exist_ok=True, parents=True)
    # Bug fix: the original called Paper.read_html(f), but `f` is undefined
    # in this scope — the unpacked input path is named `file`.
    html = Paper.read_html(file)
    bibitems = get_bibitems(html)
    mapping = resolve_references(reference_trie, bibitems)
    update_references(html, mapping)
    save_html(output, html)
| 70 | + |
# Cached reference dump and the pickled Aho-Corasick automaton built from it.
# NOTE(review): DUMP_REFERENCES_PATH is not used anywhere in this module —
# presumably consumed by other code; confirm before removing. Hard-coded
# absolute paths tie this script to a specific machine.
DUMP_REFERENCES_PATH = Path("/home/ubuntu/pwc/mycache/references-short.json")

TRIE_PATH = Path("/home/ubuntu/pwc/mycache/automaton.pkl")
| 74 | + |
def _load_trie(automaton_path):
    """Pool initializer: load the automaton into each worker's module global.

    Needed on platforms whose multiprocessing start method is *spawn*
    (Windows, macOS default), where workers do not inherit the parent's
    globals via fork.
    """
    global reference_trie
    with open(automaton_path, 'rb') as f:
        reference_trie = pickle.load(f)

def normalize_references(source_path, target_path, automaton, jobs=1):
    """Resolve bibliography references in every HTML file under *source_path*.

    Args:
        source_path: directory searched recursively for ``*.html`` files.
        target_path: directory where rewritten files are written.
        automaton: path to a pickled Aho-Corasick reference automaton.
        jobs: number of worker processes.
    """
    global reference_trie
    source_path = Path(source_path)
    target_path = Path(target_path)
    # SECURITY: pickle.load executes arbitrary code — only load trusted files.
    with open(automaton, 'rb') as f:
        reference_trie = pickle.load(f)
    # NOTE(review): outputs are flattened to target_path/<name>; same-named
    # files in different subdirectories would overwrite each other — confirm
    # the corpus has unique file names.
    params = [(file, target_path / file.name)
              for file in source_path.glob("**/*.html")]
    # The initializer makes reference_trie available in workers even when
    # the start method is spawn; under fork it merely reloads the same file.
    with Pool(jobs, initializer=_load_trie, initargs=(automaton,)) as p:
        p.map(resolve_references_in_html, params)
| 84 | + |
if __name__ == "__main__":
    # Expose the CLI via python-fire: positional/flag args map to parameters.
    fire.Fire(normalize_references)