Skip to content

Commit 58734ff

Browse files
committed
Many tests fixed and code refactors.
1 parent 86682d1 commit 58734ff

File tree

148 files changed

+39990
-20102
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the searchbox below for content that may be hidden.

148 files changed

+39990
-20102
lines changed

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,4 +13,5 @@ coverage.xml
1313
.coverage
1414
htmlcov/
1515
README_old.rst
16-
testing_scratch/
16+
testing_scratch/
17+
tests/testing_files/new_intermediate_results/

docs/reporting.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -164,12 +164,15 @@ Keywords
164164
<last_author>
165165
<authors> Will be replaced with a comma separated list of author names of all authors.
166166
<grants> Will be replaced with a comma separated list of grants associated with the publication.
167+
<queried_sources> Will be replaced with a comma separated list of the sources where information was found for the publication.
167168
168169
Pub Author Keywords - Pulled from the authors section of each publication in the publications.json file.
169170
<pub_author_first>
170171
<pub_author_last>
171172
<pub_author_initials>
172173
<pub_author_affiliations>
174+
<pub_author_ORCID>
175+
<pub_author_id>
173176
174177
Author Keywords - Pulled from the Authors section of the configuration JSON file.
175178
<author_first>
@@ -377,6 +380,8 @@ Keywords
377380
<pub_author_last> - Collaborator's last name.
378381
<pub_author_initials> - Collaborator's initials.
379382
<pub_author_affiliations> - Collaborator's affiliations.
383+
<pub_author_ORCID> - Collaborator's ORCID.
384+
<pub_author_id> - Collaborator's ID.
380385
381386
382387
Examples

src/academic_tracker/__main__.py

Lines changed: 74 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -13,18 +13,21 @@
1313
[--save-all-queries]
1414
[--PMID-reference --PMID_reference]
1515
[--MEDLINE-reference --MEDLINE_reference]
16+
[--keep-duplicates]
1617
[--no-Crossref --no_Crossref]
1718
[--no-PubMed --no_PubMed]
1819
[--verbose --silent]
1920
academic_tracker find_ORCID <config_json_file> [--verbose --silent]
2021
academic_tracker find_Google_Scholar <config_json_file> [--verbose --silent]
2122
academic_tracker add_authors <config_json_file> <authors_file> [--verbose --silent]
2223
academic_tracker tokenize_reference <references_file_or_URL> [--MEDLINE-reference --MEDLINE_reference]
24+
[--keep-duplicates]
2325
[--verbose --silent]
2426
academic_tracker gen_reports_and_emails_auth <config_json_file> <publication_json_file> [--test --verbose --silent]
2527
academic_tracker gen_reports_and_emails_ref <config_json_file> <references_file_or_URL> <publication_json_file> [--test]
2628
[--prev-pub=<file-path> --prev_pub=<file-path>]
2729
[--MEDLINE-reference --MEDLINE_reference]
30+
[--keep-duplicates]
2831
[--verbose --silent]
2932
3033
Options:
@@ -37,6 +40,7 @@
3740
Enter "ignore" for the <file_path> to not look for previous publications.json files in tracker directories.
3841
--prev_pub=<file-path> Deprecated. Use --prev-pub instead.
3942
--save-all-queries Save all queried results from each source in "all_results.json".
43+
--keep-duplicates After references are tokenized, duplicate entries are removed by default; use this option to keep duplicate entries instead.
4044
4145
Reference Type Options:
4246
--PMID-reference Indicates that the reference_file is a PMID file and only PubMed info will be returned.
@@ -82,6 +86,11 @@
8286
VERBOSE = True
8387
SILENT = False
8488

89+
## TODO a
90+
## Make sure in documentation that author affiliation is said to be a newline separated list, was comma, but had to change to match PubMed.
91+
## Change ref and author search to be aware of collective authors, tokenized citations needs to change.
92+
## In the tests for reporting, are the tests using a version of the publication dict that has author affiliations separated with newlines?
93+
8594
def main():
8695

8796
## Have to modify the doc string so docopt can recognize more options than what is written.
@@ -121,7 +130,8 @@ def main():
121130
args["--no_PubMed"] or args["--no-PubMed"],
122131
args["--test"],
123132
args["--prev-pub"] if args["--prev-pub"] else args["--prev_pub"],
124-
args["--save-all-queries"])
133+
args["--save-all-queries"],
134+
not args["--keep-duplicates"])
125135
elif len(sys.argv) > 1 and sys.argv[1] == "find_ORCID":
126136
find_ORCID(args["<config_json_file>"])
127137
elif len(sys.argv) > 1 and sys.argv[1] == "find_Google_Scholar":
@@ -130,7 +140,8 @@ def main():
130140
add_authors(args["<config_json_file>"], args["<authors_file>"])
131141
elif len(sys.argv) > 1 and sys.argv[1] == "tokenize_reference":
132142
tokenize_reference(args["<references_file_or_URL>"],
133-
args["--MEDLINE_reference"] or args["--MEDLINE-reference"])
143+
args["--MEDLINE_reference"] or args["--MEDLINE-reference"],
144+
not args["--keep-duplicates"])
134145
elif len(sys.argv) > 1 and sys.argv[1] == "gen_reports_and_emails_auth":
135146
gen_reports_and_emails_auth(args["<config_json_file>"], args["<publication_json_file>"], args["--test"])
136147
elif len(sys.argv) > 1 and sys.argv[1] == "gen_reports_and_emails_ref":
@@ -139,7 +150,8 @@ def main():
139150
args["<publication_json_file>"],
140151
args["--MEDLINE_reference"] or args["--MEDLINE-reference"],
141152
args["--test"],
142-
args["--prev-pub"] if args["--prev-pub"] else args["--prev_pub"])
153+
args["--prev-pub"] if args["--prev-pub"] else args["--prev_pub"],
154+
not args["--keep-duplicates"])
143155
else:
144156
print("Unrecognized command")
145157

@@ -193,7 +205,7 @@ def author_search(config_json_filepath, no_ORCID, no_GoogleScholar, no_Crossref,
193205

194206

195207
def reference_search(config_json_filepath, ref_path_or_URL, MEDLINE_reference, no_Crossref, no_PubMed,
196-
test, prev_pub_filepath, save_all_results):
208+
test, prev_pub_filepath, save_all_results, remove_duplicates):
197209
"""Query PubMed and Crossref for publications matching a reference.
198210
199211
Read in user inputs and check for error, query sources based on inputs, build
@@ -207,13 +219,15 @@ def reference_search(config_json_filepath, ref_path_or_URL, MEDLINE_reference, n
207219
no_PubMed (bool): If True search PubMed else don't. Reduces checking on config JSON if True.
208220
test (bool): If True save_dir_name is tracker-test instead of tracker- and emails are not sent.
209221
prev_pub_filepath (str or None): filepath to the publication JSON to read in.
210-
save_all_results (bool): if True, save all of the queried publications from each source as "all_results.json"
222+
save_all_results (bool): if True, save all of the queried publications from each source as "all_results.json".
223+
remove_duplicates (bool): if True, remove duplicate entries in tokenized citations.
211224
"""
212225

213226
config_dict, tokenized_citations, has_previous_pubs, prev_pubs = \
214227
ref_srch_modularized.input_reading_and_checking(config_json_filepath, ref_path_or_URL,
215228
MEDLINE_reference, no_Crossref, no_PubMed,
216-
prev_pub_filepath)
229+
prev_pub_filepath,
230+
remove_duplicates)
217231

218232
publication_dict, tokenized_citations, all_queries = ref_srch_modularized.build_publication_dict(config_dict, tokenized_citations, no_Crossref, no_PubMed)
219233

@@ -374,13 +388,53 @@ def add_authors(config_json_filepath, authors_filepath):
374388
if missing_values:
375389
helper_functions.vprint("Error: The following columns have null values:\n" + "\n".join(missing_values))
376390
sys.exit()
391+
392+
393+
if "first_name" in df.columns and not "last_name" in df.columns:
394+
helper_functions.vprint("Error: There is a 'first_name' column without a matching 'last_name' column.")
395+
sys.exit()
396+
397+
if "last_name" in df.columns and not "first_name" in df.columns:
398+
helper_functions.vprint("Error: There is a 'last_name' column without a matching 'first_name' column.")
399+
sys.exit()
400+
401+
if not "last_name" in df.columns and not "first_name" in df.columns and not "collective_name" in df.columns:
402+
helper_functions.vprint("Error: There must be either a 'collective_name' column or 'first_name' and 'last_name' columns.")
403+
sys.exit()
404+
405+
406+
if not "collective_name" in df.columns:
407+
missing_first_or_last_names = df.loc[:, ["first_name", "last_name"]].isnull().any(axis=1)
408+
missing_names_indexes = missing_first_or_last_names[missing_first_or_last_names==True].index.values
409+
else:
410+
missing_collective_names = df.loc[:, "collective_name"].isnull()
411+
if "first_name" in df.columns and "last_name" in df.columns:
412+
missing_first_or_last_names = df.loc[:, ["first_name", "last_name"]].isnull().any(axis=1)
413+
missing_names = missing_collective_names & missing_first_or_last_names
414+
missing_names_indexes = missing_names[missing_names==True].index.values
415+
else:
416+
missing_names_indexes = missing_collective_names[missing_collective_names==True].index.values
417+
418+
if len(missing_names_indexes) > 0:
419+
message = ("Error: The following rows have incomplete name columns:\n" +
420+
"\n".join([str(index+1) for index in missing_names_indexes]) +
421+
"\nEach row must have values in either the 'collective_name' column "
422+
"or the 'first_name' and 'last_name' columns.")
423+
helper_functions.vprint(message)
424+
sys.exit()
425+
426+
377427

378428
for column in required_columns:
379429
df.loc[:, column] = df.loc[:, column].astype(str)
380430

381431
## Assuming all list types are string lists.
382432
author_keys = tracker_schema.config_schema["properties"]["Authors"]["additionalProperties"]["properties"]
383433
list_type_keys = [key for key in author_keys if "type" in author_keys[key] and author_keys[key]["type"] == "array"]
434+
author_keys = tracker_schema.config_schema["properties"]["Authors"]["additionalProperties"]["then"]["properties"]
435+
list_type_keys += [key for key in author_keys if "type" in author_keys[key] and author_keys[key]["type"] == "array"]
436+
author_keys = tracker_schema.config_schema["properties"]["Authors"]["additionalProperties"]["else"]["properties"]
437+
list_type_keys += [key for key in author_keys if "type" in author_keys[key] and author_keys[key]["type"] == "array"]
384438
for key in list_type_keys:
385439
if key in df.columns:
386440
df.loc[:, key] = df.loc[:, key].astype(str)
@@ -394,19 +448,21 @@ def add_authors(config_json_filepath, authors_filepath):
394448
save_dir_name = "tracker-" + re.sub(r"\-| |\:", "", str(datetime.datetime.now())[2:16])
395449
os.mkdir(save_dir_name)
396450

397-
fileio.save_json_to_file(save_dir_name, "configuration.json", config_dict)
451+
fileio.save_json_to_file(save_dir_name, "configuration.json", config_dict, False)
398452
helper_functions.vprint("Success! configuration.json saved in " + save_dir_name)
399453

400454

401455

402-
def tokenize_reference(ref_path_or_URL, MEDLINE_reference):
456+
def tokenize_reference(ref_path_or_URL, MEDLINE_reference, remove_duplicates):
403457
"""Tokenize input reference file.
404458
405459
Args:
406460
ref_path_or_URL (str): either a filepath to file to tokenize or a URL to tokenize.
461+
MEDLINE_reference (bool): True indicates that ref_path_or_URL is in MEDLINE format.
462+
remove_duplicates (bool): if True, remove duplicate entries in tokenized citations.
407463
"""
408464

409-
tokenized_citations = ref_srch_webio.tokenize_reference_input(ref_path_or_URL, MEDLINE_reference)
465+
tokenized_citations = ref_srch_webio.tokenize_reference_input(ref_path_or_URL, MEDLINE_reference, remove_duplicates)
410466

411467
report_string = ref_srch_emails_and_reports.create_tokenization_report(tokenized_citations)
412468

@@ -451,7 +507,13 @@ def gen_reports_and_emails_auth(config_json_filepath, publication_json_filepath,
451507

452508

453509

454-
def gen_reports_and_emails_ref(config_json_filepath, ref_path_or_URL, publication_json_filepath, MEDLINE_reference, test, prev_pub_filepath):
510+
def gen_reports_and_emails_ref(config_json_filepath,
511+
ref_path_or_URL,
512+
publication_json_filepath,
513+
MEDLINE_reference,
514+
test,
515+
prev_pub_filepath,
516+
remove_duplicates):
455517
"""Generate reports and emails for input publications and reference as if reference_search was ran.
456518
457519
Args:
@@ -461,6 +523,7 @@ def gen_reports_and_emails_ref(config_json_filepath, ref_path_or_URL, publicatio
461523
MEDLINE_reference (bool): If True ref_path_or_URL is a filepath to a MEDLINE formatted file.
462524
test (bool): If True save_dir_name is tracker-test instead of tracker- and emails are not sent.
463525
prev_pub_filepath (str or None): filepath to the publication JSON to read in.
526+
remove_duplicates (bool): if True, remove duplicate entries in tokenized citations.
464527
"""
465528

466529
## read in config file
@@ -480,7 +543,7 @@ def gen_reports_and_emails_ref(config_json_filepath, ref_path_or_URL, publicatio
480543
if has_previous_pubs:
481544
user_input_checking.prev_pubs_file_check(prev_pubs)
482545

483-
tokenized_citations = ref_srch_webio.tokenize_reference_input(ref_path_or_URL, MEDLINE_reference)
546+
tokenized_citations = ref_srch_webio.tokenize_reference_input(ref_path_or_URL, MEDLINE_reference, remove_duplicates)
484547
## Read in publications.json
485548
publication_dict = fileio.load_json(publication_json_filepath)
486549
user_input_checking.prev_pubs_file_check(publication_dict)

0 commit comments

Comments
 (0)