Skip to content

Commit 281c907

Browse files
author
Marcin Kardas
committed
Add the datasets and extractor helpers
1 parent f9d90f0 commit 281c907

File tree

2 files changed

+62
-0
lines changed

2 files changed

+62
-0
lines changed

axcell/helpers/datasets.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
import pandas as pd
2+
3+
def read_arxiv_papers(path):
    """Load the arXiv papers listing stored as CSV.

    Thin wrapper around ``pandas.read_csv`` using its default options.

    Args:
        path: Location of the CSV file; anything ``pandas.read_csv``
            accepts (path string, ``pathlib.Path``, file-like object).

    Returns:
        pandas.DataFrame: The parsed contents of the file.
    """
    return pd.read_csv(path)
5+
6+
def read_tables(path):
    """Load tables data stored as JSON.

    Thin wrapper around ``pandas.read_json`` using its default options.

    Args:
        path: Location of the JSON file; anything ``pandas.read_json``
            accepts (path string, ``pathlib.Path``, file-like object).

    Returns:
        pandas.DataFrame: The parsed contents of the file.
    """
    return pd.read_json(path)
8+

axcell/helpers/paper_extractor.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
from pathlib import Path
2+
from axcell.helpers import LatexConverter, Unpack
3+
from axcell.errors import UnpackError, LatexConversionError
4+
from axcell.data.elastic import Paper as PaperText
5+
import axcell.data.extract_tables as table_extraction
6+
7+
import re
8+
import warnings
9+
10+
# Matches a file name that begins with a new-style arXiv identifier: four
# digits, a dot, one or more digits and an optional version suffix ("v<N>"),
# optionally followed by anything starting with a dot (e.g. file extensions).
# The identifier, including the version when present, is captured in the
# named group "arxiv_id". Names that do not begin with "<4 digits>.<digits>"
# do not match.
arxiv_re = re.compile(r"^(?P<arxiv_id>\d{4}\.\d+(v\d+)?)(\..*)?$")
11+
12+
13+
class PaperExtractor:
    """Unpack a paper's sources, convert them to HTML and extract text/tables.

    All inputs and outputs live under a single ``root`` directory:

    * ``root/sources/...``                    -- input archives,
    * ``root/unpacked_sources/<subpath>``     -- unpacked sources,
    * ``root/htmls/<subpath>/index.html``     -- converted HTML,
    * ``root/papers/<subpath>/text.json``     -- extracted text,
    * ``root/papers/<subpath>``               -- directory passed to
      ``table_extraction.save_tables``.

    ``<subpath>`` is the source's parent directory relative to
    ``root/sources`` joined with the inferred arXiv id.
    """

    def __init__(self, root):
        """Create an extractor operating under the *root* directory.

        Args:
            root: Base directory of the data layout; converted to
                ``pathlib.Path``.
        """
        self.root = Path(root)
        self.unpack = Unpack()
        self.latex = LatexConverter()

    def __call__(self, source):
        """Process a single source file and report the outcome.

        Args:
            source: Path to the source file. It must be located under
                ``root/sources``.

        Returns:
            str: One of

            * ``'withdrawn'`` -- unpacking failed and the error message
              starts with ``'The paper has been withdrawn'``,
            * ``'no-tex'`` -- unpacking failed for any other reason,
            * ``'processing-error'`` -- LaTeX to HTML conversion failed,
            * ``'success'`` -- HTML, text and tables were written.

        Raises:
            ValueError: If *source* is not located under ``root/sources``
                (raised by ``Path.relative_to``).
        """
        source = Path(source)

        m = arxiv_re.match(source.name)
        if m:
            arxiv_id = m.group('arxiv_id')
        else:
            # Fall back to the whole file name so processing can continue.
            warnings.warn(f'Unable to infer arxiv_id from "{source.name}" filename')
            arxiv_id = source.name

        # Mirror the directory structure of root/sources in every output tree.
        subpath = source.relative_to(self.root / 'sources').parent / arxiv_id
        unpack_path = self.root / 'unpacked_sources' / subpath
        try:
            self.unpack(source, unpack_path)
        except UnpackError as e:
            # Exceptions have no standard ``message`` attribute in Python 3;
            # fall back to str(e) in case UnpackError does not define one.
            message = getattr(e, 'message', None)
            if message is None:
                message = str(e)
            if message.startswith('The paper has been withdrawn'):
                return 'withdrawn'
            return 'no-tex'

        # Only the conversion itself is guarded; file-system errors while
        # writing the result are not conversion errors and propagate.
        try:
            html = self.latex.to_html(unpack_path)
        except LatexConversionError:
            return 'processing-error'

        html_path = self.root / 'htmls' / subpath / 'index.html'
        html_path.parent.mkdir(parents=True, exist_ok=True)
        html_path.write_text(html, 'utf-8')

        text_path = self.root / 'papers' / subpath / 'text.json'
        doc = PaperText.from_html(html, arxiv_id)
        text_path.parent.mkdir(parents=True, exist_ok=True)
        text_path.write_text(doc.to_json(), 'utf-8')

        tables_path = self.root / 'papers' / subpath
        tables = table_extraction.extract_tables(html)
        table_extraction.save_tables(tables, tables_path)

        return 'success'

0 commit comments

Comments
 (0)