|
| 1 | +from pathlib import Path |
| 2 | +from axcell.helpers import LatexConverter, Unpack |
| 3 | +from axcell.errors import UnpackError, LatexConversionError |
| 4 | +from axcell.data.elastic import Paper as PaperText |
| 5 | +import axcell.data.extract_tables as table_extraction |
| 6 | + |
| 7 | +import re |
| 8 | +import warnings |
| 9 | + |
# Matches a file name that begins with an arXiv-style identifier: four digits,
# a dot, one or more digits and an optional "v<number>" version suffix. The
# identifier is captured as the named group ``arxiv_id``; anything after a
# following dot (e.g. file extensions) is allowed but not part of the id.
arxiv_re = re.compile(
    r"^(?P<arxiv_id>\d{4}\.\d+(v\d+)?)(\..*)?$"
)
| 11 | + |
| 12 | + |
class PaperExtractor:
    """Callable that processes one paper source archive end to end.

    For a given source file the extractor unpacks it, converts the unpacked
    LaTeX to HTML, and then writes the extracted text and tables. All outputs
    are written below ``root`` in the ``unpacked_sources``, ``htmls`` and
    ``papers`` directories, each mirroring the archive's location relative to
    ``root / 'sources'``.
    """

    def __init__(self, root):
        """Store the data root and create the unpack / LaTeX helpers.

        Args:
            root: Base data directory (anything accepted by ``Path``).
                Source archives are expected below ``root / 'sources'``.
        """
        self.root = Path(root)
        self.unpack = Unpack()
        self.latex = LatexConverter()

    def __call__(self, source):
        """Process a single source archive and report the outcome.

        Args:
            source: Path to the archive; it must be located below
                ``root / 'sources'``.

        Returns:
            A status string:

            * ``'withdrawn'`` - unpacking failed with an ``UnpackError``
              whose message starts with "The paper has been withdrawn";
            * ``'no-tex'`` - unpacking failed with any other ``UnpackError``;
            * ``'processing-error'`` - the LaTeX to HTML conversion raised
              ``LatexConversionError``;
            * ``'success'`` - HTML, text and tables were all written.

        Raises:
            ValueError: If ``source`` is not below ``root / 'sources'``
                (raised by ``Path.relative_to``).
        """
        source = Path(source)

        m = arxiv_re.match(source.name)
        if not m:
            # Not fatal: fall back to the whole file name as the identifier.
            warnings.warn(f'Unable to infer arxiv_id from "{source.name}" filename')
            arxiv_id = source.name
        else:
            arxiv_id = m.group('arxiv_id')

        # Keep the directory layout of the archive below 'sources', but name
        # the leaf directory after the paper id instead of the archive file.
        subpath = source.relative_to(self.root / 'sources').parent / arxiv_id
        unpack_path = self.root / 'unpacked_sources' / subpath
        try:
            self.unpack(source, unpack_path)
        except UnpackError as e:
            # Python 3 exceptions have no built-in ``message`` attribute, so
            # use it only when UnpackError provides one and otherwise fall
            # back to the standard string form of the exception.
            message = getattr(e, 'message', None)
            if not isinstance(message, str):
                message = str(e)
            if message.startswith('The paper has been withdrawn'):
                return 'withdrawn'
            return 'no-tex'

        html_path = self.root / 'htmls' / subpath / 'index.html'
        try:
            html = self.latex.to_html(unpack_path)
            html_path.parent.mkdir(parents=True, exist_ok=True)
            html_path.write_text(html, 'utf-8')
        except LatexConversionError:
            return 'processing-error'

        # Text and tables are both derived from the converted HTML and are
        # stored side by side in the paper's directory.
        text_path = self.root / 'papers' / subpath / 'text.json'
        doc = PaperText.from_html(html, arxiv_id)
        text_path.parent.mkdir(parents=True, exist_ok=True)
        text_path.write_text(doc.to_json(), 'utf-8')

        tables_path = self.root / 'papers' / subpath
        tables = table_extraction.extract_tables(html)
        table_extraction.save_tables(tables, tables_path)

        return 'success'
0 commit comments