Skip to content

Commit d32e252

Browse files
author
Marcin Kardas
committed
Add PaperCollection
* add metadata when loading elastic documents from jsons * load paper's text, tables and annotations
1 parent 4852ff6 commit d32e252

File tree

6 files changed

+175
-5
lines changed

6 files changed

+175
-5
lines changed

extract_texts.py

100644100755
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
#!/usr/bin/env python
2+
13
import fire
24
from sota_extractor2.data.elastic import Paper
35
from pathlib import Path

sota_extractor2/data/elastic.py

Lines changed: 43 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212

1313
from sota_extractor2.data.doc_utils import get_text, content_in_section, group_content, set_ids_by_labels, read_html
1414
from .. import config
15+
from pathlib import Path
1516

1617

1718
def setup_default_connection():
@@ -108,6 +109,20 @@ class Fragment(Document):
108109
class Index:
109110
name = 'paper-fragments'
110111

112+
@classmethod
def from_json(cls, json):
    """Build a fragment from its JSON representation.

    Args:
        json: either a JSON string (decoded with ``serializer.loads``) or an
            already-decoded mapping. It must provide the ``paper_id`` and
            ``order`` keys, which are combined into the document id.

    Returns:
        Whatever ``cls.from_es`` returns for the synthesized hit.
    """
    source = serializer.loads(json) if isinstance(json, str) else json
    # Wrap the source in the same envelope keys ``from_es`` receives
    # (``_source``, ``_id``, ``_index``, ``_type``).
    hit = {
        "_source": source,
        "_id": f"{source['paper_id']}_{source['order']}",
        "_index": "paper-fragments",
        "_type": "doc",
    }
    return cls.from_es(hit)
124+
125+
111126
def __repr__(self):
112127
return f"# {self.header},\n" \
113128
f"{self.text}" \
@@ -125,7 +140,34 @@ class Index:
125140
name = 'papers'
126141

127142
def to_json(self):
    """Serialize this document to a JSON string.

    Returns:
        The result of ``serializer.dumps`` applied to ``self.to_dict()``.
    """
    data = self.to_dict()
    # Serialize the dict that was just built; the previous code passed an
    # unbound name ``d`` here, which raised NameError on every call.
    return serializer.dumps(data)
145+
146+
@classmethod
def from_json(cls, json, paper_id=None):
    """Build a paper, together with its fragments, from JSON.

    Args:
        json: either a JSON string (decoded with ``serializer.loads``) or an
            already-decoded mapping. An optional ``fragments`` entry holds
            the fragment sources and is not passed on as part of the paper's
            own source.
        paper_id: optional document id; when given it is stored as ``_id``
            of the synthesized hit.

    Returns:
        The object returned by ``cls.from_es``, with its ``fragments``
        attribute set to a ``Fragments`` built from the decoded fragments.
    """
    if isinstance(json, str):
        source = serializer.loads(json)
    else:
        # Shallow copy: popping 'fragments' below must not alter the
        # mapping owned by the caller.
        source = dict(json)
    fragments = source.pop('fragments', [])

    hit = {
        '_source': source,
        '_index': 'papers',
        '_type': 'doc',
    }
    if paper_id is not None:
        hit['_id'] = paper_id

    paper = cls.from_es(hit)
    paper.fragments = Fragments([Fragment.from_json(f) for f in fragments])
    return paper
163+
164+
@classmethod
def from_file(cls, path):
    """Load a paper from a JSON file.

    The file name without its final suffix (``Path.stem``) is used as the
    paper id.

    Args:
        path: location of the JSON file (``str`` or ``Path``).

    Returns:
        The result of ``cls.from_json(text, paper_id)``.
    """
    path = Path(path)
    paper_id = path.stem
    # Read JSON as UTF-8 explicitly instead of relying on the locale default.
    with open(path, "rt", encoding="utf-8") as f:
        json = f.read()
    return cls.from_json(json, paper_id)
129171

130172
def to_df(self):
131173
return pd.DataFrame({'header': [f.header for f in self.fragments],

sota_extractor2/data/json.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -69,11 +69,12 @@ def cut(s, length=20):
6969
vals = pprint.pformat({to_snake_case(k): cut(str(self[k])) for k in self.keys()})
7070
return f"NodeWrap({vals})"
7171

72-
def load_gql_dump(data_or_file, compressed=True):
    """Load a GraphQL dump and wrap each top-level collection.

    Args:
        data_or_file: an already-decoded ``dict``, or a path to a JSON file.
        compressed: when a path is given, whether the file is gzip-compressed
            (opened with ``gzip.open``) or plain (opened with ``open``).

    Returns:
        A dict mapping every key of the dump's ``"data"`` entry to
        ``wrap_dict`` applied to its value.
    """
    if isinstance(data_or_file, dict):
        papers_data = data_or_file
    else:
        open_fn = gzip.open if compressed else open
        # Text mode with an explicit encoding works for both openers.
        with open_fn(data_or_file, "rt", encoding="utf-8") as f:
            papers_data = json.load(f)
    data = papers_data["data"]
    return {k: wrap_dict(v) for k, v in data.items()}
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
from .elastic import Paper as PaperText
2+
from .table import Table, read_tables
3+
from .json import load_gql_dump
4+
from pathlib import Path
5+
import re
6+
7+
class Paper:
    """A paper's text, tables and gold tags bundled together.

    Args:
        text: the paper's text object, stored as-is (may be ``None``).
        tables: the paper's tables, stored as-is (may be ``None``).
        annotations: object exposing a ``gold_tags`` string, or ``None``
            when the paper has no annotations.

    Attributes:
        text: the value passed as ``text``.
        tables: the value passed as ``tables``.
        gold_tags: ``annotations.gold_tags`` with surrounding whitespace
            stripped, or ``''`` when ``annotations`` is ``None``.
    """

    def __init__(self, text, tables, annotations):
        self.text = text
        self.tables = tables
        if annotations is None:
            self.gold_tags = ''
        else:
            self.gold_tags = annotations.gold_tags.strip()
15+
16+
17+
# Matches a trailing version suffix such as "v2" at the very end of an id.
arxiv_version_re = re.compile(r"v\d+$")


def clean_arxiv_version(arxiv_id):
    """Return *arxiv_id* without a trailing ``v<digits>`` version suffix.

    Ids that do not end in such a suffix are returned unchanged.
    """
    return arxiv_version_re.sub("", arxiv_id)
20+
21+
22+
class PaperCollection:
    """Papers loaded from a dataset directory, keyed by version-less arXiv id.

    The directory is read as follows:

    * ``texts/**/*.json`` -- one file per paper text,
    * ``tables/**/metadata.json`` -- one directory per paper with its tables,
    * ``structure-annotations.json.gz`` -- gzip-compressed annotation dump
      whose ``allPapers`` entry is iterated.

    Args:
        path: root directory of the dataset (``str`` or ``Path``).
        load_texts: whether to load paper texts.
        load_tables: whether to load paper tables.

    Iterating the collection yields paper ids; indexing by id returns the
    corresponding ``Paper``.
    """

    def __init__(self, path, load_texts=True, load_tables=True):
        # Coerce to Path: the loaders below join sub-paths with ``/``,
        # which fails for plain strings.
        self.path = Path(path)
        self.load_texts = load_texts
        self.load_tables = load_tables

        texts = self._load_texts() if self.load_texts else {}

        annotations = self._load_annotated_papers()
        if self.load_tables:
            tables = self._load_tables(annotations)
        else:
            tables = {}
            # Annotations are discarded together with the tables.
            # NOTE(review): confirm that dropping paper-level annotations
            # in this branch is intended.
            annotations = {}

        # Outer join: keep every paper that has a text or tables (or both).
        paper_ids = set(texts).union(tables)
        self._papers = {
            paper_id: Paper(texts.get(paper_id), tables.get(paper_id), annotations.get(paper_id))
            for paper_id in paper_ids
        }

    def __len__(self):
        return len(self._papers)

    def __getitem__(self, idx):
        return self._papers[idx]

    def __iter__(self):
        return iter(self._papers)

    def _load_texts(self):
        """Return ``{arxiv_id: text}`` for every JSON file under ``texts``."""
        texts = {}
        for f in (self.path / "texts").glob("**/*.json"):
            text = PaperText.from_file(f)
            texts[clean_arxiv_version(text.meta.id)] = text
        return texts

    def _load_tables(self, annotations):
        """Return ``{arxiv_id: tables}`` for every directory holding a ``metadata.json``."""
        tables = {}
        for f in (self.path / "tables").glob("**/metadata.json"):
            paper_dir = f.parent
            tables[clean_arxiv_version(paper_dir.name)] = read_tables(paper_dir, annotations)
        return tables

    def _load_annotated_papers(self):
        """Return ``{arxiv_id: annotation}`` read from the annotation dump."""
        dump = load_gql_dump(self.path / "structure-annotations.json.gz", compressed=True)["allPapers"]
        # Later entries overwrite earlier ones that share a version-less id.
        return {clean_arxiv_version(a.arxiv_id): a for a in dump}

sota_extractor2/data/table.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
import pandas as pd
2+
import json
3+
from pathlib import Path
4+
from dataclasses import dataclass
5+
from typing import List
6+
from ..helpers.jupyter import display_table
7+
8+
@dataclass
class Cell:
    """A single table cell: its raw value plus annotation data."""

    # Raw cell content.
    value: str
    # Gold tags attached to this cell; empty when the cell is unannotated.
    gold_tags: str = ''
    # References attached to the cell; defaults to None (not an empty list).
    refs: List[str] = None
13+
14+
15+
def _map_cells(df, func):
    """Apply *func* to every element of *df* and return the new frame.

    ``DataFrame.applymap`` is deprecated in newer pandas releases in favour
    of ``DataFrame.map``; use whichever the installed version provides.
    """
    if hasattr(pd.DataFrame, "map"):
        return df.map(func)
    return df.applymap(func)


class Table:
    """A table whose values are wrapped in ``Cell`` objects.

    Args:
        df: DataFrame of raw cell values; every element is wrapped in a
            ``Cell`` and the result is stored as ``self.df``.
        caption: optional table caption.
        figure_id: optional identifier of the table within the paper.
        annotations: optional object exposing ``gold_tags`` (a string) and
            ``matrix_gold_tags`` (supporting ``.shape`` and ``.iloc``); the
            per-cell tags are copied onto the matching cells.
    """

    def __init__(self, df, caption=None, figure_id=None, annotations=None):
        self.caption = caption
        self.figure_id = figure_id
        self.df = _map_cells(df, lambda value: Cell(value=value))
        if annotations is None:
            self.gold_tags = ''
            return

        self.gold_tags = annotations.gold_tags.strip()
        cell_tags = annotations.matrix_gold_tags
        rows, cols = cell_tags.shape
        for r in range(rows):
            for c in range(cols):
                self.df.iloc[r, c].gold_tags = cell_tags.iloc[r, c].strip()

    @classmethod
    def from_file(cls, path, metadata, annotations=None):
        """Read a header-less CSV file into a ``Table``.

        Args:
            path: location of the CSV file.
            metadata: mapping that may provide ``caption`` and ``figure_id``.
            annotations: forwarded unchanged to the constructor.

        An empty CSV file results in a table with an empty DataFrame.
        """
        try:
            df = pd.read_csv(path, header=None, dtype=str).fillna('')
        except pd.errors.EmptyDataError:
            df = pd.DataFrame()
        return cls(df, metadata.get('caption'), metadata.get('figure_id'), annotations)

    def display(self):
        """Render the cell values and their gold tags with ``display_table``."""
        values = _map_cells(self.df, lambda cell: cell.value).values
        tags = _map_cells(self.df, lambda cell: cell.gold_tags).values
        display_table(values, tags)
41+
42+
def read_tables(path, annotations):
    """Load every table listed in a paper directory's ``metadata.json``.

    Args:
        path: directory holding ``metadata.json`` and the files it lists.
        annotations: mapping looked up with the directory name; the result
            (or ``None`` when missing) is passed to each table.

    Returns:
        A list with one ``Table`` per metadata entry, in file order. Each
        entry must provide a ``filename`` key relative to *path*.
    """
    path = Path(path)
    with open(path / "metadata.json", "r", encoding="utf-8") as f:
        metadata = json.load(f)
    return [Table.from_file(path / m["filename"], m, annotations.get(path.name)) for m in metadata]

sota_extractor2/helpers/jupyter.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,10 @@ def display_table(table, structure=None):
1616
matrix - 2d ndarray with cell values
1717
structure - 2d ndarray with structure annotation
1818
"""
19-
matrix = table.matrix
19+
if hasattr(table, 'matrix'):
20+
matrix = table.matrix
21+
else:
22+
matrix = table
2023
if structure is None: structure = table.matrix_gold_tags
2124
html = []
2225
html.append('<link href="http://10.0.1.145:8001/static/css/main.bd3d2d63.chunk.css" rel="stylesheet">')
@@ -29,4 +32,4 @@ def display_table(table, structure=None):
2932
html.append("</tr>")
3033
html.append("</table>")
3134
html.append('</div>')
32-
display_html("\n".join(html))
35+
display_html("\n".join(html))

0 commit comments

Comments
 (0)