Skip to content

Commit 86682d1

Browse files
committed
Testing and citation match improved
Some tests fixed, but mostly citation matching has been greatly improved.
1 parent 7dac06c commit 86682d1

18 files changed

+12854
-232
lines changed

src/academic_tracker/__main__.py

Lines changed: 11 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77
[--no-ORCID --no_ORCID]
88
[--no-Crossref --no_Crossref]
99
[--no-PubMed --no_PubMed]
10-
[--citation-match-ratio=<0-100>]
1110
[--verbose --silent]
1211
academic_tracker reference_search <config_json_file> <references_file_or_URL> [--test]
1312
[--prev-pub=<file-path> --prev_pub=<file-path>]
@@ -16,7 +15,6 @@
1615
[--MEDLINE-reference --MEDLINE_reference]
1716
[--no-Crossref --no_Crossref]
1817
[--no-PubMed --no_PubMed]
19-
[--citation-match-ratio=<0-100>]
2018
[--verbose --silent]
2119
academic_tracker find_ORCID <config_json_file> [--verbose --silent]
2220
academic_tracker find_Google_Scholar <config_json_file> [--verbose --silent]
@@ -39,7 +37,6 @@
3937
Enter "ignore" for the <file_path> to not look for previous publications.json files in tracker directories.
4038
--prev_pub=<file-path> Deprecated. Use --prev-pub instead.
4139
--save-all-queries Save all queried results from each source in "all_results.json".
42-
--citation-match-ratio=<num> An integer from 0-100. The threshold to consider 2 citations the same using fuzzy matching. Lower is more forgiving. [default: 65]
4340
4441
Reference Type Options:
4542
--PMID-reference Indicates that the reference_file is a PMID file and only PubMed info will be returned.
@@ -112,8 +109,7 @@ def main():
112109
args["--no_PubMed"] or args["--no-PubMed"],
113110
args["--test"],
114111
args["--prev-pub"] if args["--prev-pub"] else args["--prev_pub"],
115-
args["--save-all-queries"],
116-
args["--citation-match-ratio"])
112+
args["--save-all-queries"])
117113
elif len(sys.argv) > 1 and sys.argv[1] == "reference_search":
118114
if args["--PMID_reference"] or args["--PMID-reference"]:
119115
PMID_reference(args["<config_json_file>"], args["<references_file_or_URL>"], args["--test"])
@@ -125,8 +121,7 @@ def main():
125121
args["--no_PubMed"] or args["--no-PubMed"],
126122
args["--test"],
127123
args["--prev-pub"] if args["--prev-pub"] else args["--prev_pub"],
128-
args["--save-all-queries"],
129-
args["--citation-match-ratio"])
124+
args["--save-all-queries"])
130125
elif len(sys.argv) > 1 and sys.argv[1] == "find_ORCID":
131126
find_ORCID(args["<config_json_file>"])
132127
elif len(sys.argv) > 1 and sys.argv[1] == "find_Google_Scholar":
@@ -151,7 +146,7 @@ def main():
151146

152147

153148
def author_search(config_json_filepath, no_ORCID, no_GoogleScholar, no_Crossref, no_PubMed,
154-
test, prev_pub_filepath, save_all_results, citation_match_ratio):
149+
test, prev_pub_filepath, save_all_results):
155150
"""Query sources for publications by author.
156151
157152
Reads in the JSON config file, previous publications JSON file, and checks for errors.
@@ -169,11 +164,10 @@ def author_search(config_json_filepath, no_ORCID, no_GoogleScholar, no_Crossref,
169164
test (bool): If True save_dir_name is tracker-test instead of tracker- and emails are not sent.
170165
prev_pub_filepath (str or None): filepath to the publication JSON to read in.
171166
save_all_results (bool): if True, save all of the queried publications from each source as "all_results.json"
172-
citation_match_ratio (int): if the fuzzy ratio between 2 citations is greater than or equal to this, then consider them to match.
173167
"""
174168

175169
config_dict = athr_srch_modularized.input_reading_and_checking(config_json_filepath, no_ORCID, no_GoogleScholar,
176-
no_Crossref, no_PubMed, citation_match_ratio)
170+
no_Crossref, no_PubMed)
177171

178172
## Create an authors_json for each project in the config_dict and update those authors attributes with the project attributes.
179173
authors_by_project_dict, config_dict = athr_srch_modularized.generate_internal_data_and_check_authors(config_dict)
@@ -184,7 +178,7 @@ def author_search(config_json_filepath, no_ORCID, no_GoogleScholar, no_Crossref,
184178
user_input_checking.prev_pubs_file_check(prev_pubs)
185179

186180
## Query sources and build publication_dict.
187-
publication_dict, all_queries = athr_srch_modularized.build_publication_dict(config_dict, prev_pubs, no_ORCID, no_GoogleScholar, no_Crossref, no_PubMed, citation_match_ratio)
181+
publication_dict, all_queries = athr_srch_modularized.build_publication_dict(config_dict, prev_pubs, no_ORCID, no_GoogleScholar, no_Crossref, no_PubMed)
188182

189183
save_dir_name = athr_srch_modularized.save_and_send_reports_and_emails(authors_by_project_dict, publication_dict, config_dict, test)
190184

@@ -199,7 +193,7 @@ def author_search(config_json_filepath, no_ORCID, no_GoogleScholar, no_Crossref,
199193

200194

201195
def reference_search(config_json_filepath, ref_path_or_URL, MEDLINE_reference, no_Crossref, no_PubMed,
202-
test, prev_pub_filepath, save_all_results, citation_match_ratio):
196+
test, prev_pub_filepath, save_all_results):
203197
"""Query PubMed and Crossref for publications matching a reference.
204198
205199
Read in user inputs and check for error, query sources based on inputs, build
@@ -214,12 +208,14 @@ def reference_search(config_json_filepath, ref_path_or_URL, MEDLINE_reference, n
214208
test (bool): If True save_dir_name is tracker-test instead of tracker- and emails are not sent.
215209
prev_pub_filepath (str or None): filepath to the publication JSON to read in.
216210
save_all_results (bool): if True, save all of the queried publications from each source as "all_results.json"
217-
citation_match_ratio (int): if the fuzzy ratio between 2 citations is greater than or equal to this, then consider them to match.
218211
"""
219212

220-
config_dict, tokenized_citations, has_previous_pubs, prev_pubs = ref_srch_modularized.input_reading_and_checking(config_json_filepath, ref_path_or_URL, MEDLINE_reference, no_Crossref, no_PubMed, prev_pub_filepath, citation_match_ratio)
213+
config_dict, tokenized_citations, has_previous_pubs, prev_pubs = \
214+
ref_srch_modularized.input_reading_and_checking(config_json_filepath, ref_path_or_URL,
215+
MEDLINE_reference, no_Crossref, no_PubMed,
216+
prev_pub_filepath)
221217

222-
publication_dict, tokenized_citations, all_queries = ref_srch_modularized.build_publication_dict(config_dict, tokenized_citations, no_Crossref, no_PubMed, citation_match_ratio)
218+
publication_dict, tokenized_citations, all_queries = ref_srch_modularized.build_publication_dict(config_dict, tokenized_citations, no_Crossref, no_PubMed)
223219

224220
save_dir_name = ref_srch_modularized.save_and_send_reports_and_emails(config_dict, tokenized_citations, publication_dict, prev_pubs, has_previous_pubs, test)
225221

src/academic_tracker/athr_srch_modularized.py

Lines changed: 11 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
from . import webio
2121

2222

23-
def input_reading_and_checking(config_json_filepath, no_ORCID, no_GoogleScholar, no_Crossref, no_PubMed, citation_match_ratio):
23+
def input_reading_and_checking(config_json_filepath, no_ORCID, no_GoogleScholar, no_Crossref, no_PubMed):
2424
"""Read in inputs from user and do error checking.
2525
2626
Args:
@@ -29,18 +29,10 @@ def input_reading_and_checking(config_json_filepath, no_ORCID, no_GoogleScholar,
2929
no_GoogleScholar (bool): if True don't search Google Scholar, else do. Reduces checking on config JSON if True.
3030
no_Crossref (bool): If True don't search Crossref, else do. Reduces checking on config JSON if True.
3131
no_PubMed (bool): If True don't search PubMed, else do. Reduces checking on config JSON if True.
32-
citation_match_ratio (int): should be an integer between 0-100.
3332
3433
Returns:
3534
config_dict (dict): Matches the Configuration file JSON schema.
36-
"""
37-
if not isinstance(citation_match_ratio, int):
38-
helper_functions.vprint("Error: The given citation-match-ratio is not an integer value.")
39-
sys.exit()
40-
elif citation_match_ratio > 100 or citation_match_ratio < 0:
41-
helper_functions.vprint("Error: The given citation-match-ratio is not within the range 0-100.")
42-
sys.exit()
43-
35+
"""
4436
## read in config file
4537
config_dict = fileio.load_json(config_json_filepath)
4638

@@ -93,7 +85,7 @@ def generate_internal_data_and_check_authors(config_dict):
9385

9486

9587

96-
def build_publication_dict(config_dict, prev_pubs, no_ORCID, no_GoogleScholar, no_Crossref, no_PubMed, citation_match_ratio):
88+
def build_publication_dict(config_dict, prev_pubs, no_ORCID, no_GoogleScholar, no_Crossref, no_PubMed):
9789
"""Query PubMed, ORCID, Google Scholar, and Crossref for publications for the authors.
9890
9991
Args:
@@ -103,7 +95,6 @@ def build_publication_dict(config_dict, prev_pubs, no_ORCID, no_GoogleScholar, n
10395
no_GoogleScholar (bool): if True don't search Google Scholar, else do.
10496
no_Crossref (bool): If True don't search Crossref, else do.
10597
no_PubMed (bool): If True don't search PubMed, else do.
106-
citation_match_ratio (int): if the fuzzy ratio between 2 citations is greater than or equal to this, then consider them to match.
10798
10899
Returns:
109100
running_pubs (dict): The dictionary matching the publication JSON schema.
@@ -116,30 +107,30 @@ def build_publication_dict(config_dict, prev_pubs, no_ORCID, no_GoogleScholar, n
116107
all_queries = {}
117108
if not no_PubMed:
118109
helper_functions.vprint("Searching PubMed.")
119-
running_pubs, PubMed_publication_dict = athr_srch_webio.search_PubMed_for_pubs(running_pubs, config_dict["Authors"], config_dict["PubMed_search"]["PubMed_email"], citation_match_ratio)
110+
running_pubs, PubMed_publication_dict = athr_srch_webio.search_PubMed_for_pubs(running_pubs, config_dict["Authors"], config_dict["PubMed_search"]["PubMed_email"])
120111
all_queries["PubMed"] = PubMed_publication_dict
121112
if not no_ORCID:
122113
helper_functions.vprint("Searching ORCID.")
123-
running_pubs, ORCID_publication_dict = athr_srch_webio.search_ORCID_for_pubs(running_pubs, config_dict["ORCID_search"]["ORCID_key"], config_dict["ORCID_search"]["ORCID_secret"], config_dict["Authors"], citation_match_ratio)
114+
running_pubs, ORCID_publication_dict = athr_srch_webio.search_ORCID_for_pubs(running_pubs, config_dict["ORCID_search"]["ORCID_key"], config_dict["ORCID_search"]["ORCID_secret"], config_dict["Authors"])
124115
all_queries["ORCID"] = ORCID_publication_dict
125116
if not no_GoogleScholar:
126117
helper_functions.vprint("Searching Google Scholar.")
127-
running_pubs, Google_Scholar_publication_dict = athr_srch_webio.search_Google_Scholar_for_pubs(running_pubs, config_dict["Authors"], config_dict["Crossref_search"]["mailto_email"], citation_match_ratio)
118+
running_pubs, Google_Scholar_publication_dict = athr_srch_webio.search_Google_Scholar_for_pubs(running_pubs, config_dict["Authors"], config_dict["Crossref_search"]["mailto_email"])
128119
all_queries["Google Scholar"] = Google_Scholar_publication_dict
129120
if not no_Crossref:
130121
helper_functions.vprint("Searching Crossref.")
131-
running_pubs, Crossref_publication_dict = athr_srch_webio.search_Crossref_for_pubs(running_pubs, config_dict["Authors"], config_dict["Crossref_search"]["mailto_email"], citation_match_ratio)
122+
running_pubs, Crossref_publication_dict = athr_srch_webio.search_Crossref_for_pubs(running_pubs, config_dict["Authors"], config_dict["Crossref_search"]["mailto_email"])
132123
all_queries["Crossref"] = Crossref_publication_dict
133124

134125
## Do a second pass using the saved queries.
135126
if not no_PubMed:
136-
running_pubs, PubMed_publication_dict = athr_srch_webio.search_PubMed_for_pubs(running_pubs, config_dict["Authors"], config_dict["PubMed_search"]["PubMed_email"], citation_match_ratio, all_queries["PubMed"])
127+
running_pubs, PubMed_publication_dict = athr_srch_webio.search_PubMed_for_pubs(running_pubs, config_dict["Authors"], config_dict["PubMed_search"]["PubMed_email"], all_queries["PubMed"])
137128
if not no_ORCID:
138-
running_pubs, ORCID_publication_dict = athr_srch_webio.search_ORCID_for_pubs(running_pubs, config_dict["ORCID_search"]["ORCID_key"], config_dict["ORCID_search"]["ORCID_secret"], config_dict["Authors"], citation_match_ratio, all_queries["ORCID"])
129+
running_pubs, ORCID_publication_dict = athr_srch_webio.search_ORCID_for_pubs(running_pubs, config_dict["ORCID_search"]["ORCID_key"], config_dict["ORCID_search"]["ORCID_secret"], config_dict["Authors"], all_queries["ORCID"])
139130
if not no_GoogleScholar:
140-
running_pubs, Google_Scholar_publication_dict = athr_srch_webio.search_Google_Scholar_for_pubs(running_pubs, config_dict["Authors"], config_dict["Crossref_search"]["mailto_email"], citation_match_ratio, all_queries["Google Scholar"])
131+
running_pubs, Google_Scholar_publication_dict = athr_srch_webio.search_Google_Scholar_for_pubs(running_pubs, config_dict["Authors"], config_dict["Crossref_search"]["mailto_email"], all_queries["Google Scholar"])
141132
if not no_Crossref:
142-
running_pubs, Crossref_publication_dict = athr_srch_webio.search_Crossref_for_pubs(running_pubs, config_dict["Authors"], config_dict["Crossref_search"]["mailto_email"], citation_match_ratio, all_queries["Crossref"])
133+
running_pubs, Crossref_publication_dict = athr_srch_webio.search_Crossref_for_pubs(running_pubs, config_dict["Authors"], config_dict["Crossref_search"]["mailto_email"], all_queries["Crossref"])
143134

144135
## Compare current pubs with previous and only keep those that are new or updated.
145136
for pub_id, pub_values in prev_pubs.items():

0 commit comments

Comments
 (0)