3
3
from .json import load_gql_dump
4
4
from pathlib import Path
5
5
import re
6
+ from tqdm import tqdm
7
+ from joblib import Parallel , delayed
6
8
7
9
class Paper :
8
10
def __init__ (self , text , tables , annotations ):
@@ -15,11 +17,11 @@ def __init__(self, text, tables, annotations):
15
17
16
18
17
19
# Matches a trailing arXiv version suffix such as "v2" at the end of an id.
arxiv_version_re = re.compile(r"v\d+$")


def clear_arxiv_version(arxiv_id):
    """Return *arxiv_id* with any trailing ``v<digits>`` suffix removed.

    Only a suffix anchored at the end of the string is stripped; ids
    without such a suffix are returned unchanged.

    >>> clear_arxiv_version("1706.03762v5")
    '1706.03762'
    >>> clear_arxiv_version("1706.03762")
    '1706.03762'
    """
    return arxiv_version_re.sub("", arxiv_id)


# The helper was previously named ``clean_arxiv_version``; keep that name
# importable so callers that have not been updated keep working.
clean_arxiv_version = clear_arxiv_version
20
22
21
23
22
- class PaperCollection :
24
+ class PaperCollection ( dict ) :
23
25
def __init__ (self , path , load_texts = True , load_tables = True ):
24
26
self .path = path
25
27
self .load_texts = load_texts
@@ -50,27 +52,20 @@ def __iter__(self):
50
52
return iter (self ._papers )
51
53
52
54
def _load_texts(self):
    """Load every ``*.json`` file found under ``<path>/texts``.

    Each file is parsed with ``PaperText.from_file`` through a joblib
    ``Parallel`` runner (``n_jobs=-1``, process-based workers preferred).

    Returns a dict mapping ``clear_arxiv_version(text.meta.id)`` to the
    loaded text. If two files map to the same key, the one that comes
    later in the glob listing wins.
    """
    text_paths = list((self.path / "texts").glob("**/*.json"))
    runner = Parallel(n_jobs=-1, prefer="processes")
    loaded_texts = runner(delayed(PaperText.from_file)(p) for p in text_paths)

    texts_by_id = {}
    for paper_text in loaded_texts:
        texts_by_id[clear_arxiv_version(paper_text.meta.id)] = paper_text
    return texts_by_id
59
58
60
59
61
60
def _load_tables(self, annotations):
    """Load the tables of every paper directory under ``<path>/tables``.

    A paper directory is any directory containing a ``metadata.json``
    file. Each directory is passed, together with *annotations*, to
    ``read_tables`` through a joblib ``Parallel`` runner (``n_jobs=-1``,
    process-based workers preferred).

    Returns a dict mapping ``clear_arxiv_version(<directory name>)`` to
    whatever ``read_tables`` returned for that directory.
    """
    metadata_files = list((self.path / "tables").glob("**/metadata.json"))
    runner = Parallel(n_jobs=-1, prefer="processes")
    loaded_tables = runner(
        delayed(read_tables)(meta.parent, annotations) for meta in metadata_files
    )

    # Results are paired with their source file by position in the listing.
    tables_by_id = {}
    for meta, paper_tables in zip(metadata_files, loaded_tables):
        tables_by_id[clear_arxiv_version(meta.parent.name)] = paper_tables
    return tables_by_id
69
64
70
65
def _load_annotated_papers(self):
    """Read the annotation dump and index its papers by arXiv id.

    Loads ``structure-annotations.json.gz`` from ``self.path`` with
    ``load_gql_dump(..., compressed=True)`` and takes its ``"allPapers"``
    entry.

    Returns a dict mapping ``clear_arxiv_version(paper.arxiv_id)`` to the
    paper record. A later record with the same key replaces an earlier one.
    """
    dump_path = self.path / "structure-annotations.json.gz"
    all_papers = load_gql_dump(dump_path, compressed=True)["allPapers"]
    return {clear_arxiv_version(paper.arxiv_id): paper for paper in all_papers}
0 commit comments