Skip to content

Commit 7dac06c

Browse files
committed
Updates and testing fixes.
Changed the code so there are 2 passes over the queries and we try to merge data from different sources.
1 parent 4776d06 commit 7dac06c

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

53 files changed

+273323
-777
lines changed

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,4 +12,5 @@ dist/
1212
coverage.xml
1313
.coverage
1414
htmlcov/
15-
README_old.rst
15+
README_old.rst
16+
testing_scratch/

docs/todo.rst

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,4 +16,14 @@ Add PMCID and grants to pymed package.
1616

1717
Add expanded search to orcid package or look for more up to date package to use. Expanded search was added to ORCID's API with 3.0 release. orcid package appears to be 2.0 only.
1818

19-
Add capability to get the citations each paper cites.
19+
Add capability to get the citations each paper cites.
20+
21+
Switch to merging the data from each source, so that we try to fill in information that wasn't found previously.
22+
Keep the queries from each source, and do 2 passes with the new merge logic. This makes it so that if
23+
a publication was on PubMed but no author could be matched there, while an author was matched at another
24+
source, we can merge the information in the second pass. Would need to change the logic to first look
25+
and see if the publication is already in the list; if it is, then we don't need to make an author match
26+
because an author match was made from another source. The big changes are to keep the queries, do a second
27+
pass, merge information, and use existence in the list as verification in addition to an author match.
28+
29+
Save references out in "citation" format. Look at the formats Google Scholar offers, for example EndNote.

src/academic_tracker/__main__.py

Lines changed: 31 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2,17 +2,21 @@
22
Usage:
33
academic_tracker author_search <config_json_file> [--test]
44
[--prev_pub=<file-path> --prev-pub=<file-path>]
5+
[--save-all-queries]
56
[--no-GoogleScholar --no_GoogleScholar]
67
[--no-ORCID --no_ORCID]
78
[--no-Crossref --no_Crossref]
89
[--no-PubMed --no_PubMed]
10+
[--citation-match-ratio=<0-100>]
911
[--verbose --silent]
1012
academic_tracker reference_search <config_json_file> <references_file_or_URL> [--test]
1113
[--prev-pub=<file-path> --prev_pub=<file-path>]
14+
[--save-all-queries]
1215
[--PMID-reference --PMID_reference]
1316
[--MEDLINE-reference --MEDLINE_reference]
1417
[--no-Crossref --no_Crossref]
1518
[--no-PubMed --no_PubMed]
19+
[--citation-match-ratio=<0-100>]
1620
[--verbose --silent]
1721
academic_tracker find_ORCID <config_json_file> [--verbose --silent]
1822
academic_tracker find_Google_Scholar <config_json_file> [--verbose --silent]
@@ -34,6 +38,8 @@
3438
--prev-pub=<file-path> Filepath to json or csv with publication ids to ignore.
3539
Enter "ignore" for the <file_path> to not look for previous publications.json files in tracker directories.
3640
--prev_pub=<file-path> Deprecated. Use --prev-pub instead.
41+
--save-all-queries Save all queried results from each source in "all_results.json".
42+
--citation-match-ratio=<num> An integer from 0-100. The threshold to consider 2 citations the same using fuzzy matching. Lower is more forgiving. [default: 65]
3743
3844
Reference Type Options:
3945
--PMID-reference Indicates that the reference_file is a PMID file and only PubMed info will be returned.
@@ -105,7 +111,9 @@ def main():
105111
args["--no_Crossref"] or args["--no-Crossref"],
106112
args["--no_PubMed"] or args["--no-PubMed"],
107113
args["--test"],
108-
args["--prev-pub"] if args["--prev-pub"] else args["--prev_pub"])
114+
args["--prev-pub"] if args["--prev-pub"] else args["--prev_pub"],
115+
args["--save-all-queries"],
116+
args["--citation-match-ratio"])
109117
elif len(sys.argv) > 1 and sys.argv[1] == "reference_search":
110118
if args["--PMID_reference"] or args["--PMID-reference"]:
111119
PMID_reference(args["<config_json_file>"], args["<references_file_or_URL>"], args["--test"])
@@ -116,7 +124,9 @@ def main():
116124
args["--no_Crossref"] or args["--no-Crossref"],
117125
args["--no_PubMed"] or args["--no-PubMed"],
118126
args["--test"],
119-
args["--prev-pub"] if args["--prev-pub"] else args["--prev_pub"])
127+
args["--prev-pub"] if args["--prev-pub"] else args["--prev_pub"],
128+
args["--save-all-queries"],
129+
args["--citation-match-ratio"])
120130
elif len(sys.argv) > 1 and sys.argv[1] == "find_ORCID":
121131
find_ORCID(args["<config_json_file>"])
122132
elif len(sys.argv) > 1 and sys.argv[1] == "find_Google_Scholar":
@@ -140,7 +150,8 @@ def main():
140150

141151

142152

143-
def author_search(config_json_filepath, no_ORCID, no_GoogleScholar, no_Crossref, no_PubMed, test, prev_pub_filepath):
153+
def author_search(config_json_filepath, no_ORCID, no_GoogleScholar, no_Crossref, no_PubMed,
154+
test, prev_pub_filepath, save_all_results, citation_match_ratio):
144155
"""Query sources for publications by author.
145156
146157
Reads in the JSON config file, previous publications JSON file, and checks for errors.
@@ -157,9 +168,12 @@ def author_search(config_json_filepath, no_ORCID, no_GoogleScholar, no_Crossref,
157168
no_PubMed (bool): If True don't search PubMed, else do. Reduces checking on config JSON if True.
158169
test (bool): If True save_dir_name is tracker-test instead of tracker- and emails are not sent.
159170
prev_pub_filepath (str or None): filepath to the publication JSON to read in.
171+
save_all_results (bool): if True, save all of the queried publications from each source as "all_results.json"
172+
citation_match_ratio (int): if the fuzzy ratio between 2 citations is greater than or equal to this, then consider them to match.
160173
"""
161174

162-
config_dict = athr_srch_modularized.input_reading_and_checking(config_json_filepath, no_ORCID, no_GoogleScholar, no_Crossref, no_PubMed)
175+
config_dict = athr_srch_modularized.input_reading_and_checking(config_json_filepath, no_ORCID, no_GoogleScholar,
176+
no_Crossref, no_PubMed, citation_match_ratio)
163177

164178
## Create an authors_json for each project in the config_dict and update those authors attributes with the project attributes.
165179
authors_by_project_dict, config_dict = athr_srch_modularized.generate_internal_data_and_check_authors(config_dict)
@@ -170,18 +184,22 @@ def author_search(config_json_filepath, no_ORCID, no_GoogleScholar, no_Crossref,
170184
user_input_checking.prev_pubs_file_check(prev_pubs)
171185

172186
## Query sources and build publication_dict.
173-
publication_dict = athr_srch_modularized.build_publication_dict(config_dict, prev_pubs, no_ORCID, no_GoogleScholar, no_Crossref, no_PubMed)
187+
publication_dict, all_queries = athr_srch_modularized.build_publication_dict(config_dict, prev_pubs, no_ORCID, no_GoogleScholar, no_Crossref, no_PubMed, citation_match_ratio)
174188

175189
save_dir_name = athr_srch_modularized.save_and_send_reports_and_emails(authors_by_project_dict, publication_dict, config_dict, test)
176190

177191
## combine previous and new publications lists and save
178192
fileio.save_publications_to_file(save_dir_name, publication_dict, prev_pubs)
179193

194+
if save_all_results:
195+
fileio.save_json_to_file(save_dir_name, "all_results.json", all_queries)
196+
180197
helper_functions.vprint("Success. Publications, reports, and emails saved in " + save_dir_name)
181198

182199

183200

184-
def reference_search(config_json_filepath, ref_path_or_URL, MEDLINE_reference, no_Crossref, no_PubMed, test, prev_pub_filepath):
201+
def reference_search(config_json_filepath, ref_path_or_URL, MEDLINE_reference, no_Crossref, no_PubMed,
202+
test, prev_pub_filepath, save_all_results, citation_match_ratio):
185203
"""Query PubMed and Crossref for publications matching a reference.
186204
187205
Read in user inputs and check for error, query sources based on inputs, build
@@ -195,17 +213,22 @@ def reference_search(config_json_filepath, ref_path_or_URL, MEDLINE_reference, n
195213
no_PubMed (bool): If True don't search PubMed, else do. Reduces checking on config JSON if True.
196214
test (bool): If True save_dir_name is tracker-test instead of tracker- and emails are not sent.
197215
prev_pub_filepath (str or None): filepath to the publication JSON to read in.
216+
save_all_results (bool): if True, save all of the queried publications from each source as "all_results.json"
217+
citation_match_ratio (int): if the fuzzy ratio between 2 citations is greater than or equal to this, then consider them to match.
198218
"""
199219

200-
config_dict, tokenized_citations, has_previous_pubs, prev_pubs = ref_srch_modularized.input_reading_and_checking(config_json_filepath, ref_path_or_URL, MEDLINE_reference, no_Crossref, no_PubMed, prev_pub_filepath)
220+
config_dict, tokenized_citations, has_previous_pubs, prev_pubs = ref_srch_modularized.input_reading_and_checking(config_json_filepath, ref_path_or_URL, MEDLINE_reference, no_Crossref, no_PubMed, prev_pub_filepath, citation_match_ratio)
201221

202-
publication_dict, tokenized_citations = ref_srch_modularized.build_publication_dict(config_dict, tokenized_citations, no_Crossref, no_PubMed)
222+
publication_dict, tokenized_citations, all_queries = ref_srch_modularized.build_publication_dict(config_dict, tokenized_citations, no_Crossref, no_PubMed, citation_match_ratio)
203223

204224
save_dir_name = ref_srch_modularized.save_and_send_reports_and_emails(config_dict, tokenized_citations, publication_dict, prev_pubs, has_previous_pubs, test)
205225

206226
fileio.save_publications_to_file(save_dir_name, publication_dict, {})
207227
fileio.save_json_to_file(save_dir_name, "tokenized_reference.json", tokenized_citations)
208228

229+
if save_all_results:
230+
fileio.save_json_to_file(save_dir_name, "all_results.json", all_queries)
231+
209232
helper_functions.vprint("Success. Publications and reports saved in " + save_dir_name)
210233

211234

src/academic_tracker/athr_srch_modularized.py

Lines changed: 40 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
from . import webio
2121

2222

23-
def input_reading_and_checking(config_json_filepath, no_ORCID, no_GoogleScholar, no_Crossref, no_PubMed):
23+
def input_reading_and_checking(config_json_filepath, no_ORCID, no_GoogleScholar, no_Crossref, no_PubMed, citation_match_ratio):
2424
"""Read in inputs from user and do error checking.
2525
2626
Args:
@@ -29,10 +29,17 @@ def input_reading_and_checking(config_json_filepath, no_ORCID, no_GoogleScholar,
2929
no_GoogleScholar (bool): if True don't search Google Scholar, else do. Reduces checking on config JSON if True.
3030
no_Crossref (bool): If True don't search Crossref, else do. Reduces checking on config JSON if True.
3131
no_PubMed (bool): If True don't search PubMed, else do. Reduces checking on config JSON if True.
32+
citation_match_ratio (int): should be an integer between 0-100.
3233
3334
Returns:
3435
config_dict (dict): Matches the Configuration file JSON schema.
3536
"""
37+
if not isinstance(citation_match_ratio, int):
38+
helper_functions.vprint("Error: The given citation-match-ratio is not an integer value.")
39+
sys.exit()
40+
elif citation_match_ratio > 100 or citation_match_ratio < 0:
41+
helper_functions.vprint("Error: The given citation-match-ratio is not within the range 0-100.")
42+
sys.exit()
3643

3744
## read in config file
3845
config_dict = fileio.load_json(config_json_filepath)
@@ -86,7 +93,7 @@ def generate_internal_data_and_check_authors(config_dict):
8693

8794

8895

89-
def build_publication_dict(config_dict, prev_pubs, no_ORCID, no_GoogleScholar, no_Crossref, no_PubMed):
96+
def build_publication_dict(config_dict, prev_pubs, no_ORCID, no_GoogleScholar, no_Crossref, no_PubMed, citation_match_ratio):
9097
"""Query PubMed, ORCID, Google Scholar, and Crossref for publications for the authors.
9198
9299
Args:
@@ -96,61 +103,62 @@ def build_publication_dict(config_dict, prev_pubs, no_ORCID, no_GoogleScholar, n
96103
no_GoogleScholar (bool): if True don't search Google Scholar, else do.
97104
no_Crossref (bool): If True don't search Crossref, else do.
98105
no_PubMed (bool): If True don't search PubMed, else do.
106+
citation_match_ratio (int): if the fuzzy ratio between 2 citations is greater than or equal to this, then consider them to match.
99107
100108
Returns:
101-
publication_dict (dict): The dictionary matching the publication JSON schema.
102-
prev_pubs (dict): Same as input, but updated with the new publications found.
109+
running_pubs (dict): The dictionary matching the publication JSON schema.
110+
all_queries (dict): The pubs searched for each source and each author. {"PubMed":{"author1":[pub1, ...], ...}, "ORCID":{"author1":[pub1, ...], ...}, "Google Scholar":{"author1":[pub1, ...], ...}, "Crossref":{"author1":[pub1, ...], ...}}
103111
"""
104112

105113
## Get publications from PubMed
106114
helper_functions.vprint("Finding author's publications. This could take a while.")
107-
current_pubs = {}
115+
running_pubs = {}
116+
all_queries = {}
108117
if not no_PubMed:
109118
helper_functions.vprint("Searching PubMed.")
110-
PubMed_publication_dict = athr_srch_webio.search_PubMed_for_pubs(current_pubs, config_dict["Authors"], config_dict["PubMed_search"]["PubMed_email"])
111-
current_pubs.update(PubMed_publication_dict)
119+
running_pubs, PubMed_publication_dict = athr_srch_webio.search_PubMed_for_pubs(running_pubs, config_dict["Authors"], config_dict["PubMed_search"]["PubMed_email"], citation_match_ratio)
120+
all_queries["PubMed"] = PubMed_publication_dict
112121
if not no_ORCID:
113122
helper_functions.vprint("Searching ORCID.")
114-
ORCID_publication_dict = athr_srch_webio.search_ORCID_for_pubs(current_pubs, config_dict["ORCID_search"]["ORCID_key"], config_dict["ORCID_search"]["ORCID_secret"], config_dict["Authors"])
115-
current_pubs.update(ORCID_publication_dict)
123+
running_pubs, ORCID_publication_dict = athr_srch_webio.search_ORCID_for_pubs(running_pubs, config_dict["ORCID_search"]["ORCID_key"], config_dict["ORCID_search"]["ORCID_secret"], config_dict["Authors"], citation_match_ratio)
124+
all_queries["ORCID"] = ORCID_publication_dict
116125
if not no_GoogleScholar:
117126
helper_functions.vprint("Searching Google Scholar.")
118-
Google_Scholar_publication_dict = athr_srch_webio.search_Google_Scholar_for_pubs(current_pubs, config_dict["Authors"], config_dict["Crossref_search"]["mailto_email"])
119-
current_pubs.update(Google_Scholar_publication_dict)
127+
running_pubs, Google_Scholar_publication_dict = athr_srch_webio.search_Google_Scholar_for_pubs(running_pubs, config_dict["Authors"], config_dict["Crossref_search"]["mailto_email"], citation_match_ratio)
128+
all_queries["Google Scholar"] = Google_Scholar_publication_dict
120129
if not no_Crossref:
121130
helper_functions.vprint("Searching Crossref.")
122-
Crossref_publication_dict = athr_srch_webio.search_Crossref_for_pubs(current_pubs, config_dict["Authors"], config_dict["Crossref_search"]["mailto_email"])
123-
current_pubs.update(Crossref_publication_dict)
131+
running_pubs, Crossref_publication_dict = athr_srch_webio.search_Crossref_for_pubs(running_pubs, config_dict["Authors"], config_dict["Crossref_search"]["mailto_email"], citation_match_ratio)
132+
all_queries["Crossref"] = Crossref_publication_dict
124133

125-
publication_dict = {}
134+
## Do a second pass using the saved queries.
126135
if not no_PubMed:
127-
for key, value in PubMed_publication_dict.items():
128-
if not key in publication_dict:
129-
publication_dict[key] = value
136+
running_pubs, PubMed_publication_dict = athr_srch_webio.search_PubMed_for_pubs(running_pubs, config_dict["Authors"], config_dict["PubMed_search"]["PubMed_email"], citation_match_ratio, all_queries["PubMed"])
130137
if not no_ORCID:
131-
for key, value in ORCID_publication_dict.items():
132-
if not key in publication_dict:
133-
publication_dict[key] = value
138+
running_pubs, ORCID_publication_dict = athr_srch_webio.search_ORCID_for_pubs(running_pubs, config_dict["ORCID_search"]["ORCID_key"], config_dict["ORCID_search"]["ORCID_secret"], config_dict["Authors"], citation_match_ratio, all_queries["ORCID"])
134139
if not no_GoogleScholar:
135-
for key, value in Google_Scholar_publication_dict.items():
136-
if not key in publication_dict:
137-
publication_dict[key] = value
140+
running_pubs, Google_Scholar_publication_dict = athr_srch_webio.search_Google_Scholar_for_pubs(running_pubs, config_dict["Authors"], config_dict["Crossref_search"]["mailto_email"], citation_match_ratio, all_queries["Google Scholar"])
138141
if not no_Crossref:
139-
for key, value in Crossref_publication_dict.items():
140-
if not key in publication_dict:
141-
publication_dict[key] = value
142-
142+
running_pubs, Crossref_publication_dict = athr_srch_webio.search_Crossref_for_pubs(running_pubs, config_dict["Authors"], config_dict["Crossref_search"]["mailto_email"], citation_match_ratio, all_queries["Crossref"])
143+
143144
## Compare current pubs with previous and only keep those that are new or updated.
144145
for pub_id, pub_values in prev_pubs.items():
145-
if pub_id in publication_dict and not deepdiff.DeepDiff(publication_dict[pub_id], pub_values, ignore_order=True, report_repetition=True):
146-
del publication_dict[pub_id]
147-
146+
if pub_id in running_pubs and not deepdiff.DeepDiff(running_pubs[pub_id], pub_values, ignore_order=True, report_repetition=True):
147+
del running_pubs[pub_id]
148148

149-
if len(publication_dict) == 0:
149+
if len(running_pubs) == 0:
150150
helper_functions.vprint("No new publications found.")
151151
sys.exit()
152152

153-
return publication_dict
153+
## Convert PubMed articles class to dicts so they can be saved as JSON.
154+
if not no_PubMed:
155+
for author, pub_list in all_queries["PubMed"].items():
156+
new_list = []
157+
for pub in pub_list:
158+
new_list.append(helper_functions.modify_pub_dict_for_saving(pub, True))
159+
all_queries["PubMed"][author] = new_list
160+
161+
return running_pubs, all_queries
154162

155163

156164

0 commit comments

Comments
 (0)