Skip to content

Commit 92cc525

Browse files
committed
Process evidences in parallel
1 parent 6bbf053 commit 92cc525

File tree

1 file changed

+7
-2
lines changed

1 file changed

+7
-2
lines changed

helpers.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from elasticsearch_dsl import connections
66
from tqdm import tqdm
77
import pandas as pd
8+
from joblib import delayed, Parallel
89

910
class Helper:
1011
def split_pc_pickle(self, path, outdir="pc-parts", parts=8):
@@ -16,18 +17,22 @@ def split_pc_pickle(self, path, outdir="pc-parts", parts=8):
1617
part = PaperCollection(pc[i:i + step])
1718
part.to_pickle(outdir / f"pc-part-{idx:02}.pkl")
1819

19-
def _evidences_for_pc(self, path, es_host="10.0.1.145", es_timeout=20,
                      paper_limit=100, corpus_limit=20):
    """Extract cell evidences for a single PaperCollection pickle part.

    Reads the ``PaperCollection`` pickled at *path*, runs the
    ``CellEvidenceExtractor`` over every paper's tables, and writes the
    concatenated evidences DataFrame next to the input as
    ``<path>.evidences.pkl``.

    Args:
        path: path to a ``pc-part-NN.pkl`` pickle (str or Path).
        es_host: Elasticsearch host to connect to (was hard-coded;
            default preserves previous behavior).
        es_timeout: Elasticsearch connection timeout in seconds.
        paper_limit: forwarded to the evidence extractor per paper.
        corpus_limit: forwarded to the evidence extractor per corpus.
    """
    path = Path(path)
    pc = PaperCollection.from_pickle(path)
    cell_evidences = CellEvidenceExtractor()
    # Each (multiprocessing) worker needs its own ES connection; creating it
    # here rather than at module level keeps the method fork-safe.
    connections.create_connection(hosts=[es_host], timeout=es_timeout)
    raw_evidences = []
    for paper in tqdm(pc):
        raw_evidences.append(
            cell_evidences(paper, paper.tables,
                           paper_limit=paper_limit, corpus_limit=corpus_limit))
    if not raw_evidences:
        # pd.concat([]) raises ValueError; an empty part simply produces
        # no output file instead of crashing the whole parallel run.
        return
    raw_evidences = pd.concat(raw_evidences)
    path = path.with_suffix(".evidences.pkl")
    raw_evidences.to_pickle(path)
3031

32+
def evidences_for_pc(self, pattern="pc-parts/pc-part-??.pkl", jobs=-1):
    """Run evidence extraction over every pickle part matching *pattern*.

    Fans out one ``_evidences_for_pc`` call per part via joblib, using
    the multiprocessing backend (``jobs=-1`` uses all cores).
    """
    parts = sorted(Path(".").glob(pattern))
    tasks = (delayed(self._evidences_for_pc)(part) for part in parts)
    Parallel(backend="multiprocessing", n_jobs=jobs)(tasks)
35+
3136
def merge_evidences(self, output="evidences.pkl", pattern="pc-parts/pc-part-*.evidences.pkl"):
3237
pickles = sorted(Path(".").glob(pattern))
3338
evidences = [pd.read_pickle(pickle) for pickle in pickles]

0 commit comments

Comments
 (0)