Skip to content

Commit 29189fa

Browse files
committed
Add a helper script to cache evidences
1 parent 0dea0c9 commit 29189fa

File tree

1 file changed

+38
-0
lines changed

1 file changed

+38
-0
lines changed

helpers.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
from fire import Fire
2+
from pathlib import Path
3+
from sota_extractor2.data.paper_collection import PaperCollection
4+
from sota_extractor2.data.structure import CellEvidenceExtractor
5+
from elasticsearch_dsl import connections
6+
from tqdm import tqdm
7+
import pandas as pd
8+
9+
class Helper:
    """CLI helpers (driven by python-fire) for splitting a PaperCollection
    pickle into chunks, extracting cell evidences per chunk, and merging
    the per-chunk evidence pickles back into one file."""

    def split_pc_pickle(self, path, outdir="pc-parts", parts=8):
        """Split the PaperCollection pickled at ``path`` into up to ``parts``
        roughly equal chunks saved as ``pc-part-NN.pkl`` under ``outdir``.

        Args:
            path: pickle file produced by ``PaperCollection.to_pickle``.
            outdir: destination directory, created if missing.
            parts: desired number of chunks; the last may be smaller.
        """
        outdir = Path(outdir)
        outdir.mkdir(parents=True, exist_ok=True)
        pc = PaperCollection.from_pickle(path)
        # Ceiling division so every paper lands in some chunk; max(1, ...)
        # guards against step == 0 (range() rejects a zero step) when the
        # collection is empty.
        step = max(1, (len(pc) + parts - 1) // parts)
        for idx, i in enumerate(range(0, len(pc), step)):
            part = PaperCollection(pc[i:i + step])
            part.to_pickle(outdir / f"pc-part-{idx:02}.pkl")

    def evidences_for_pc(self, path, es_host="10.0.1.145", es_timeout=20):
        """Extract cell evidences for every paper in the pickled
        PaperCollection at ``path`` and pickle the concatenated DataFrame
        next to it as ``<name>.evidences.pkl``.

        Args:
            path: pickle file produced by ``PaperCollection.to_pickle``.
            es_host: Elasticsearch host the extractor queries
                (previously hard-coded).
            es_timeout: Elasticsearch connection timeout in seconds.

        Raises:
            ValueError: if the collection contains no papers.
        """
        path = Path(path)
        pc = PaperCollection.from_pickle(path)
        cell_evidences = CellEvidenceExtractor()
        connections.create_connection(hosts=[es_host], timeout=es_timeout)
        raw_evidences = []
        for paper in tqdm(pc):
            raw_evidences.append(cell_evidences(paper, paper.tables))
        if not raw_evidences:
            # pd.concat([]) would raise a cryptic error; fail clearly instead.
            raise ValueError(f"No papers found in {path}")
        raw_evidences = pd.concat(raw_evidences)
        # foo.pkl -> foo.evidences.pkl (with_suffix replaces the final suffix)
        path = path.with_suffix(".evidences.pkl")
        raw_evidences.to_pickle(path)

    def merge_evidences(self, output="evidences.pkl", pattern="pc-parts/pc-part-*.evidences.pkl"):
        """Concatenate all evidence pickles matching ``pattern`` (relative to
        the current directory) into one DataFrame pickled to ``output``.

        Args:
            output: path of the merged pickle to write.
            pattern: glob pattern, resolved relative to the cwd.

        Raises:
            ValueError: if no file matches ``pattern``.
        """
        # sorted() keeps chunk order (pc-part-00, pc-part-01, ...) stable.
        paths = sorted(Path(".").glob(pattern))
        if not paths:
            raise ValueError(f"No files match pattern {pattern!r}")
        evidences = pd.concat(pd.read_pickle(p) for p in paths)
        evidences.to_pickle(output)
36+
37+
38+
if __name__ == "__main__":
    # Expose Helper's methods as CLI subcommands via python-fire.
    Fire(Helper())

0 commit comments

Comments
 (0)