@@ -5,7 +5,8 @@
 from elasticsearch_dsl import connections
 from tqdm import tqdm
 import pandas as pd
+from joblib import delayed, Parallel


 class Helper:
@@ -16,18 +17,22 @@ def split_pc_pickle(self, path, outdir="pc-parts", parts=8):
             part = PaperCollection(pc[i:i + step])
             part.to_pickle(outdir / f"pc-part-{idx:02}.pkl")

-    def evidences_for_pc(self, path):
+    def _evidences_for_pc(self, path):
         path = Path(path)
         pc = PaperCollection.from_pickle(path)
         cell_evidences = CellEvidenceExtractor()
         connections.create_connection(hosts=['10.0.1.145'], timeout=20)
         raw_evidences = []
         for paper in tqdm(pc):
-            raw_evidences.append(cell_evidences(paper, paper.tables))
+            raw_evidences.append(cell_evidences(paper, paper.tables, paper_limit=100, corpus_limit=20))
         raw_evidences = pd.concat(raw_evidences)
         path = path.with_suffix(".evidences.pkl")
         raw_evidences.to_pickle(path)

+    def evidences_for_pc(self, pattern="pc-parts/pc-part-??.pkl", jobs=-1):
+        pickles = sorted(Path(".").glob(pattern))
+        Parallel(backend="multiprocessing", n_jobs=jobs)(delayed(self._evidences_for_pc)(path) for path in pickles)
+
     def merge_evidences(self, output="evidences.pkl", pattern="pc-parts/pc-part-*.evidences.pkl"):
         pickles = sorted(Path(".").glob(pattern))
         evidences = [pd.read_pickle(pickle) for pickle in pickles]
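For context, the new evidences_for_pc fans each pickled PaperCollection part out to its own worker process via joblib's Parallel/delayed. A minimal sketch of that pattern, standalone and outside the repo (process_part is a hypothetical stand-in for _evidences_for_pc):

from pathlib import Path

from joblib import Parallel, delayed


def process_part(path):
    # Stand-in for the per-part work in the diff: load the part,
    # extract cell evidences, write <part>.evidences.pkl next to it.
    return Path(path).with_suffix(".evidences.pkl")


if __name__ == "__main__":
    parts = sorted(Path(".").glob("pc-parts/pc-part-??.pkl"))
    # n_jobs=-1 uses all cores; the multiprocessing backend gives each
    # part its own process, so each worker sets up its own state.
    results = Parallel(backend="multiprocessing", n_jobs=-1)(
        delayed(process_part)(p) for p in parts
    )

This also explains why _evidences_for_pc calls connections.create_connection itself: with the multiprocessing backend, each worker process needs its own Elasticsearch connection rather than inheriting one from the parent.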