Skip to content

Commit 9ab8a8e

Browse files
author
Marcin Kardas
authored
Merge pull request #6 from paperswithcode/clean
Clean repository from out-dated scripts
2 parents 74a1ae6 + 861ac54 commit 9ab8a8e

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

47 files changed

+228
-1863
lines changed

MANIFEST.in

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
include axcell/scripts/*
2+
include axcell/scripts/patches/*

Makefile

Lines changed: 0 additions & 83 deletions
This file was deleted.

axcell/config.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
goldtags_dump = data / "dumps" / "goldtags-2019.10.15_2227.json.gz"
1616

1717

18-
elastic = dict(hosts=['localhost'], timeout=20)
18+
elastic = dict(hosts=['127.0.0.1'], timeout=20)
1919
grobid = dict(host='grobid')
2020

2121
arxiv = data/'arxiv'

extract_tables.py renamed to axcell/data/extract_tables.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -348,7 +348,7 @@ def remove_footnotes(soup):
348348

349349

350350
def extract_tables(html):
351-
soup = BeautifulSoup(html, "lxml", from_encoding="utf-8")
351+
soup = BeautifulSoup(html, "lxml")
352352
set_ids_by_labels(soup)
353353
fix_span_tables(soup)
354354
fix_th(soup)

axcell/data/json.py

Lines changed: 51 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -71,13 +71,62 @@ def cut(s, length=20):
7171
vals = pprint.pformat({to_snake_case(k): cut(str(self[k])) for k in self.keys()})
7272
return f"NodeWrap({vals})"
7373

74+
75+
def _annotations_to_gql(annotations):
76+
nodes = []
77+
for a in annotations:
78+
tables = []
79+
for t in a['tables']:
80+
tags = []
81+
if t['leaderboard']:
82+
tags.append('leaderboard')
83+
if t['ablation']:
84+
tags.append('ablation')
85+
if not tags:
86+
tags = ['irrelevant']
87+
88+
records = {}
89+
for r in t['records']:
90+
d = dict(r)
91+
del d['row']
92+
del d['column']
93+
records[f'{r["row"]}.{r["column"]}'] = d
94+
table = {
95+
'node': {
96+
'name': f'table_{t["index"] + 1:02}.csv',
97+
'datasetText': t['dataset_text'],
98+
'notes': '',
99+
'goldTags': ' '.join(tags),
100+
'matrixGoldTags': t['segmentation'],
101+
'cellsSotaRecords': json.dumps(records),
102+
'parser': 'latexml'
103+
}
104+
}
105+
tables.append(table)
106+
node = {
107+
'arxivId': a['arxiv_id'],
108+
'goldTags': a['fold'],
109+
'tableSet': {'edges': tables}
110+
}
111+
nodes.append({'node': node})
112+
return {
113+
'data': {
114+
'allPapers': {
115+
'edges': nodes
116+
}
117+
}
118+
}
119+
120+
74121
def load_gql_dump(data_or_file, compressed=True):
75-
if isinstance(data_or_file, dict):
122+
if isinstance(data_or_file, dict) or isinstance(data_or_file, list):
76123
papers_data = data_or_file
77124
else:
78125
open_fn = gzip.open if compressed else open
79126
with open_fn(data_or_file, "rt") as f:
80-
papers_data = json.load(f)
127+
papers_data = json.load(f)
128+
if "data" not in papers_data:
129+
papers_data = _annotations_to_gql(papers_data)
81130
data = papers_data["data"]
82131
return {k:wrap_dict(v) for k,v in data.items()}
83132

axcell/data/paper_collection.py

Lines changed: 22 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
from ..helpers.jupyter import display_table
1212
import string
1313
import random
14-
from extract_tables import extract_tables
14+
from axcell.data.extract_tables import extract_tables
1515

1616

1717
class Paper:
@@ -75,23 +75,32 @@ def _load_tables(path, annotations, jobs, migrate):
7575
return {f.parent.name: tbls for f, tbls in zip(files, tables)}
7676

7777

78+
def _gql_dump_to_annotations(dump):
79+
annotations = {remove_arxiv_version(a.arxiv_id): a for a in dump}
80+
annotations.update({a.arxiv_id: a for a in dump})
81+
return annotations
82+
7883
def _load_annotated_papers(data_or_path):
79-
if isinstance(data_or_path, dict):
84+
if isinstance(data_or_path, dict) or isinstance(data_or_path, list):
8085
compressed = False
8186
else:
8287
compressed = data_or_path.suffix == ".gz"
8388
dump = load_gql_dump(data_or_path, compressed=compressed)["allPapers"]
84-
annotations = {remove_arxiv_version(a.arxiv_id): a for a in dump}
85-
annotations.update({a.arxiv_id: a for a in dump})
86-
return annotations
89+
return _gql_dump_to_annotations(dump)
8790

8891

8992
class PaperCollection(UserList):
9093
def __init__(self, data=None):
9194
super().__init__(data)
9295

9396
@classmethod
94-
def from_files(cls, path, annotations_path=None, load_texts=True, load_tables=True, load_annotations=True, jobs=-1, migrate=False):
97+
def from_files(cls, path, annotations=None, load_texts=True, load_tables=True, jobs=-1):
98+
return cls._from_files(path, annotations=annotations, annotations_path=None,
99+
load_texts=load_texts, load_tables=load_tables, load_annotations=False,
100+
jobs=jobs)
101+
102+
@classmethod
103+
def _from_files(cls, path, annotations=None, annotations_path=None, load_texts=True, load_tables=True, load_annotations=True, jobs=-1, migrate=False):
95104
path = Path(path)
96105
if annotations_path is None:
97106
annotations_path = path / "structure-annotations.json"
@@ -102,7 +111,10 @@ def from_files(cls, path, annotations_path=None, load_texts=True, load_tables=Tr
102111
else:
103112
texts = {}
104113

105-
annotations = {}
114+
if annotations is None:
115+
annotations = {}
116+
else:
117+
annotations = _load_annotated_papers(annotations)
106118
if load_tables:
107119
if load_annotations:
108120
annotations = _load_annotated_papers(annotations_path)
@@ -131,8 +143,9 @@ def get_by_id(self, paper_id, ignore_version=True):
131143
def cells_gold_tags_legend(cls):
132144
tags = [
133145
("Tag", "description"),
134-
("model-best", "model that has results that author most likely would like to have exposed"),
135-
("model-paper", "an example of a generic model, (like LSTM)"),
146+
("model-best", "the best performing model introduced in the paper"),
147+
("model-paper", "model introduced in the paper"),
148+
("model-ensemble", "ensemble of models introduced in the paper"),
136149
("model-competing", "model from another paper used for comparison"),
137150
("dataset-task", "Task"),
138151
("dataset", "Dataset"),

axcell/helpers/datasets.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
2+
3+
import pandas as pd
4+
5+
6+
def read_arxiv_papers(path):
7+
return pd.read_csv(path)
8+
9+
10+
def read_tables_annotations(path):
11+
return pd.read_json(path)

axcell/helpers/jupyter.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,4 +47,4 @@ def table_to_html(table, structure=None, layout=None, predictions=None, tooltips
4747

4848
def display_table(table, structure=None, layout=None):
4949
html = table_to_html(table, structure, layout)
50-
display_html("\n".join(html))
50+
display_html(html)

axcell/helpers/latex_converter.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,13 +21,13 @@ def rw_bind(path): return dict(bind=path, mode='rw')
2121

2222

2323
class LatexConverter:
24-
def __init__(self, base_path):
24+
def __init__(self):
2525
# pull arxivvanity/engrafo image
2626
self.client = docker.from_env()
27-
self.base_path = Path(base_path)
27+
self._scripts_path = Path(__file__).resolve().parent.parent / 'scripts'
2828

2929
def latex2html(self, source_dir, output_dir, use_named_volumes=False):
30-
base = self.base_path
30+
base = self._scripts_path
3131
source_dir = Path(source_dir)
3232
output_dir = Path(output_dir)
3333
scriptname = "/files/latex2html.sh"

axcell/helpers/paper_extractor.py

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
2+
3+
from pathlib import Path
4+
from axcell.helpers import LatexConverter, Unpack
5+
from axcell.errors import UnpackError, LatexConversionError
6+
from axcell.data.elastic import Paper as PaperText
7+
import axcell.data.extract_tables as table_extraction
8+
9+
import re
10+
import warnings
11+
12+
arxiv_re = re.compile(r"^(?P<arxiv_id>\d{4}\.\d+(v\d+)?)(\..*)?$")
13+
14+
15+
class PaperExtractor:
16+
def __init__(self, root):
17+
self.root = Path(root)
18+
self.unpack = Unpack()
19+
self.latex = LatexConverter()
20+
21+
def __call__(self, source):
22+
source = Path(source)
23+
24+
m = arxiv_re.match(source.name)
25+
if not m:
26+
warnings.warn(f'Unable to infer arxiv_id from "{source.name}" filename')
27+
arxiv_id = source.name
28+
else:
29+
arxiv_id = m.group('arxiv_id')
30+
31+
subpath = source.relative_to(self.root / 'sources').parent / arxiv_id
32+
unpack_path = self.root / 'unpacked_sources' / subpath
33+
try:
34+
self.unpack(source, unpack_path)
35+
except UnpackError as e:
36+
if e.message.startswith('The paper has been withdrawn'):
37+
return 'withdrawn'
38+
return 'no-tex'
39+
html_path = self.root / 'htmls' / subpath / 'index.html'
40+
try:
41+
html = self.latex.to_html(unpack_path)
42+
html_path.parent.mkdir(parents=True, exist_ok=True)
43+
html_path.write_text(html, 'utf-8')
44+
except LatexConversionError:
45+
return 'processing-error'
46+
47+
text_path = self.root / 'papers' / subpath / 'text.json'
48+
doc = PaperText.from_html(html, arxiv_id)
49+
text_path.parent.mkdir(parents=True, exist_ok=True)
50+
text_path.write_text(doc.to_json(), 'utf-8')
51+
52+
tables_path = self.root / 'papers' / subpath
53+
tables = table_extraction.extract_tables(html)
54+
table_extraction.save_tables(tables, tables_path)
55+
56+
return 'success'

0 commit comments

Comments
 (0)