1
1
import re
2
2
import json
3
3
from pathlib import Path
4
- from sota_extractor2 .data .elastic import Reference2
4
+ from collections import Counter
5
+ from sota_extractor2 .data .elastic import Reference2 , setup_default_connection
5
6
from sota_extractor2 .data .references import PReference , PAuthor , ReferenceStore
6
7
from tqdm import tqdm
7
8
from elasticsearch .helpers import bulk
12
13
# Elasticsearch bulk responses can exceed the default header limit,
# so raise it before any bulk saving happens.
http.client._MAXHEADERS = 1000

# Register the default Elasticsearch connection up front; index
# operations below (mget / bulk) rely on it being configured.
setup_default_connection()

# Paper metadata dump (titles, abstracts, arxiv ids, ...) used by init_pwc().
papers_path = Path("/data/dblp/papers/papers-with-abstracts.json")
def read_pwc_papers (path ):
@@ -54,23 +56,27 @@ def from_paper_dict(paper):
54
56
)
55
57
56
58
59
+ def _text (elem ): return "" .join (elem .itertext ())
60
+
61
+
57
62
def from_paper_elem (elem ):
58
- authors_str = [a .text .strip () for a in elem .findall ("author" ) if a .text .strip ()]
63
+ authors_str = [_text (a ).strip () for a in elem .findall ("author" )]
64
+ authors_str = [s for s in authors_str if s ]
59
65
authors = [PAuthor .from_fullname (a ) for a in authors_str ]
60
66
arxiv_id = None
61
67
url = None
62
68
for ee in elem .findall ("ee" ):
63
69
if url is None or "oa" in ee .attrib : # prefere open access urls
64
- url = ee . text
65
- m = arxiv_url_only_re .match (ee . text )
70
+ url = _text ( ee )
71
+ m = arxiv_url_only_re .match (_text ( ee ) )
66
72
if m :
67
- url = ee . text
73
+ url = _text ( ee ) # prefere arxiv urls
68
74
arxiv_id = m .group ("arxiv_id" )
69
75
break
70
76
title = None
71
77
title_elem = elem .find ("title" )
72
78
if title_elem is not None :
73
- title = title_elem . text .rstrip (" ." )
79
+ title = _text ( title_elem ) .rstrip (" ." )
74
80
return PReference (
75
81
title = title ,
76
82
authors = authors ,
@@ -80,9 +86,14 @@ def from_paper_elem(elem):
80
86
)
81
87
82
88
83
- def merge_references (p_references ):
89
+ def merge_references (p_references , elastic_references ):
90
+ uids = Counter ([p_ref .unique_id () for p_ref in p_references ])
84
91
for p_ref in tqdm (p_references ):
85
92
uid = p_ref .unique_id ()
93
+ # ignore papers with too common title
94
+ # (often these are "Editorial", "Preface", "Letter")
95
+ if uids [uid ] > 5 :
96
+ continue
86
97
e_ref = elastic_references .get (uid )
87
98
if not e_ref :
88
99
e_ref = Reference2 .from_ref (p_ref )
def save_all(docs):
    """Persist every document in *docs* to Elasticsearch with the bulk API."""
    # Serialize lazily; chunking keeps each bulk request a bounded size.
    actions = (doc.to_dict(True) for doc in docs)
    bulk(connections.get_connection(), actions, chunk_size=500)
108
def get_elastic_references(unique_ids, chunk_size=1000):
    """Fetch already-indexed Reference2 documents by id.

    Queries Elasticsearch in batches of *chunk_size* ids and returns a
    dict mapping unique id -> Reference2 document; ids with no existing
    document are simply omitted.
    """
    found = {}
    for start in range(0, len(unique_ids), chunk_size):
        batch = unique_ids[start:start + chunk_size]
        # mget returns results aligned with the requested ids; missing
        # documents come back falsy and are skipped.
        for uid, ref in zip(batch, Reference2.mget(batch)):
            if ref:
                found[uid] = ref
    return found
119
+
120
+
97
121
def init_pwc ():
98
122
# read list of ML papers (titles, abstracts, arxiv ids, etc.)
99
123
all_papers = read_pwc_papers (papers_path )
@@ -105,29 +129,26 @@ def init_pwc():
105
129
p_references = [ref for ref in p_references if ref .unique_id ()]
106
130
107
131
all_ids = list (set (ref .unique_id () for ref in p_references ))
108
- elastic_references = {
109
- uid : ref for uid , ref in zip (all_ids , Reference2 .mget (all_ids ))
110
- if ref
111
- }
112
-
113
- merge_references (p_references )
132
+ elastic_references = get_elastic_references (all_ids )
133
+ merge_references (p_references , elastic_references )
114
134
save_all (elastic_references .values ())
115
135
116
136
117
137
def init_dblp():
    """Parse the DBLP XML dump and merge its papers into the references index."""
    dblp_xml = ET.parse(str(Path("/data") / "dblp" / "dblp-noent.xml"))
    # dblp_xml = ET.parse(str(Path("/data") / "dblp" / "dblp-small-noent.xml"))
    root = dblp_xml.getroot()

    # One PReference per top-level record; drop entries without a usable id.
    p_references = [from_paper_elem(elem) for elem in root]
    p_references = [ref for ref in p_references if ref.unique_id()]

    all_ids = list({ref.unique_id() for ref in p_references})
    # todo: add references2 index initialization
    elastic_references = {}  # get_elastic_references(all_ids)

    merge_references(p_references, elastic_references)
    save_all(elastic_references.values())
131
150
151
def main():
    """Initialize the Reference2 index, then ingest DBLP and PWC references."""
    # Reference2._index.delete()  # uncomment to rebuild the index from scratch
    Reference2.init()
    init_dblp()
    init_pwc()


# Guard the entry point so importing this module does not trigger
# index creation and a full ingestion run as a side effect.
if __name__ == "__main__":
    main()