MoseleyBioinformaticsLab
diff --git a/.gitignore b/.gitignore
Lines changed: 2 additions & 1 deletion
diff --git a/docs/todo.rst b/docs/todo.rst
Lines changed: 11 additions & 1 deletion
diff --git a/src/academic_tracker/__main__.py b/src/academic_tracker/__main__.py
Lines changed: 31 additions & 8 deletions
diff --git a/src/academic_tracker/athr_srch_modularized.py b/src/academic_tracker/athr_srch_modularized.py
Lines changed: 40 additions & 32 deletions
@@ -12,4 +12,5 @@ dist/
 coverage.xml
 .coverage
 htmlcov/
-README_old.rst
+README_old.rst
+testing_scratch/
@@ -16,4 +16,14 @@ Add PMCID and grants to pymed package.
 
 Add expanded search to orcid package or look for more up to date package to use. Expanded search was added to ORCID's API with 3.0 release. orcid package appears to be 2.0 only.
 
-Add capability to get the citations each paper cites.
+Add capability to get the citations each paper cites.
+
+Switch to merging results across sources, so that each source can fill in information that was not found previously. 
+Keep the queries from each source, and do 2 passes with the new merge logic. That way, if a publication 
+was found on PubMed but no author could be matched there, while an author was matched at another source, 
+the information can be merged on the second pass. The logic would need to change to first check whether 
+the publication is already in the list; if it is, no author match is needed because one was already made 
+from another source. The big changes are to keep the queries, do a second pass, merge information, and 
+use existence in the list as verification in addition to an author match.
+  
+Save references out in a "citation" format. Look at the formats Google Scholar offers, for example EndNote.
@@ -2,17 +2,21 @@
 Usage:
     academic_tracker author_search <config_json_file> [--test] 
                                                       [--prev_pub=<file-path> --prev-pub=<file-path>] 
+                                                      [--save-all-queries]
                                                       [--no-GoogleScholar --no_GoogleScholar] 
                                                       [--no-ORCID --no_ORCID] 
                                                       [--no-Crossref --no_Crossref] 
                                                       [--no-PubMed --no_PubMed]
+                                                      [--citation-match-ratio=<0-100>]
                                                       [--verbose --silent]
     academic_tracker reference_search <config_json_file> <references_file_or_URL> [--test] 
                                                                                   [--prev-pub=<file-path> --prev_pub=<file-path>]
+                                                                                  [--save-all-queries]
                                                                                   [--PMID-reference --PMID_reference]
                                                                                   [--MEDLINE-reference --MEDLINE_reference]
                                                                                   [--no-Crossref --no_Crossref]
                                                                                   [--no-PubMed --no_PubMed]
+                                                                                  [--citation-match-ratio=<0-100>]
                                                                                   [--verbose --silent]
     academic_tracker find_ORCID <config_json_file> [--verbose --silent]
     academic_tracker find_Google_Scholar <config_json_file> [--verbose --silent]
@@ -34,6 +38,8 @@
     --prev-pub=<file-path>            Filepath to json or csv with publication ids to ignore. 
                                       Enter "ignore" for the <file_path> to not look for previous publications.json files in tracker directories.
     --prev_pub=<file-path>            Deprecated. Use --prev-pub instead.
+    --save-all-queries                Save all queried results from each source in "all_results.json".
+    --citation-match-ratio=<num>      An integer from 0-100. The threshold to consider 2 citations the same using fuzzy matching. Lower is more forgiving. [default: 65]
     
 Reference Type Options:    
     --PMID-reference                  Indicates that the reference_file is a PMID file and only PubMed info will be returned.
@@ -105,7 +111,9 @@ def main():
                       args["--no_Crossref"] or args["--no-Crossref"],
                       args["--no_PubMed"] or args["--no-PubMed"],
                       args["--test"], 
-                      args["--prev-pub"] if args["--prev-pub"] else args["--prev_pub"])
+                      args["--prev-pub"] if args["--prev-pub"] else args["--prev_pub"],
+                      args["--save-all-queries"],
+                      args["--citation-match-ratio"])
     elif len(sys.argv) > 1 and sys.argv[1] == "reference_search":
         if args["--PMID_reference"] or args["--PMID-reference"]:
             PMID_reference(args["<config_json_file>"], args["<references_file_or_URL>"], args["--test"])
@@ -116,7 +124,9 @@ def main():
                              args["--no_Crossref"] or args["--no-Crossref"], 
                              args["--no_PubMed"] or args["--no-PubMed"],
                              args["--test"], 
-                             args["--prev-pub"] if args["--prev-pub"] else args["--prev_pub"])
+                             args["--prev-pub"] if args["--prev-pub"] else args["--prev_pub"],
+                             args["--save-all-queries"],
+                             args["--citation-match-ratio"])
     elif len(sys.argv) > 1 and sys.argv[1] == "find_ORCID":
         find_ORCID(args["<config_json_file>"])
     elif len(sys.argv) > 1 and sys.argv[1] == "find_Google_Scholar":
@@ -140,7 +150,8 @@ def main():
 
 
 
-def author_search(config_json_filepath, no_ORCID, no_GoogleScholar, no_Crossref, no_PubMed, test, prev_pub_filepath):
+def author_search(config_json_filepath, no_ORCID, no_GoogleScholar, no_Crossref, no_PubMed, 
+                  test, prev_pub_filepath, save_all_results, citation_match_ratio):
     """Query sources for publications by author.
     
     Reads in the JSON config file, previous publications JSON file, and checks for errors.
@@ -157,9 +168,12 @@ def author_search(config_json_filepath, no_ORCID, no_GoogleScholar, no_Crossref,
         no_PubMed (bool): If True search PubMed else don't. Reduces checking on config JSON if True.
         test (bool): If True save_dir_name is tracker-test instead of tracker- and emails are not sent.
         prev_pub_filepath (str or None): filepath to the publication JSON to read in.
+        save_all_results (bool): if True, save all of the queried publications from each source as "all_results.json"
+        citation_match_ratio (int): if the fuzzy ratio between 2 citations is greater than or equal to this, then consider them to match.
     """
 
-    config_dict = athr_srch_modularized.input_reading_and_checking(config_json_filepath, no_ORCID, no_GoogleScholar, no_Crossref, no_PubMed)
+    config_dict = athr_srch_modularized.input_reading_and_checking(config_json_filepath, no_ORCID, no_GoogleScholar, 
+                                                                   no_Crossref, no_PubMed, citation_match_ratio)
 
     ## Create an authors_json for each project in the config_dict and update those authors attributes with the project attributes.
     authors_by_project_dict, config_dict = athr_srch_modularized.generate_internal_data_and_check_authors(config_dict)
@@ -170,18 +184,22 @@ def author_search(config_json_filepath, no_ORCID, no_GoogleScholar, no_Crossref,
         user_input_checking.prev_pubs_file_check(prev_pubs)
 
     ## Query sources and build publication_dict.
-    publication_dict = athr_srch_modularized.build_publication_dict(config_dict, prev_pubs, no_ORCID, no_GoogleScholar, no_Crossref, no_PubMed)            
+    publication_dict, all_queries = athr_srch_modularized.build_publication_dict(config_dict, prev_pubs, no_ORCID, no_GoogleScholar, no_Crossref, no_PubMed, citation_match_ratio)            
 
     save_dir_name = athr_srch_modularized.save_and_send_reports_and_emails(authors_by_project_dict, publication_dict, config_dict, test)
 
     ## combine previous and new publications lists and save
     fileio.save_publications_to_file(save_dir_name, publication_dict, prev_pubs)
 
+    if save_all_results:
+        fileio.save_json_to_file(save_dir_name, "all_results.json", all_queries)
+    
     helper_functions.vprint("Success. Publications, reports, and emails saved in " + save_dir_name)
 
 
 
-def reference_search(config_json_filepath, ref_path_or_URL, MEDLINE_reference, no_Crossref, no_PubMed, test, prev_pub_filepath):
+def reference_search(config_json_filepath, ref_path_or_URL, MEDLINE_reference, no_Crossref, no_PubMed, 
+                     test, prev_pub_filepath, save_all_results, citation_match_ratio):
     """Query PubMed and Crossref for publications matching a reference.
     
     Read in user inputs and check for error, query sources based on inputs, build 
@@ -195,17 +213,22 @@ def reference_search(config_json_filepath, ref_path_or_URL, MEDLINE_reference, n
         no_PubMed (bool): If True search PubMed else don't. Reduces checking on config JSON if True.
         test (bool): If True save_dir_name is tracker-test instead of tracker- and emails are not sent.
         prev_pub_filepath (str or None): filepath to the publication JSON to read in.
+        save_all_results (bool): if True, save all of the queried publications from each source as "all_results.json"
+        citation_match_ratio (int): if the fuzzy ratio between 2 citations is greater than or equal to this, then consider them to match.
     """
 
-    config_dict, tokenized_citations, has_previous_pubs, prev_pubs = ref_srch_modularized.input_reading_and_checking(config_json_filepath, ref_path_or_URL, MEDLINE_reference, no_Crossref, no_PubMed, prev_pub_filepath)       
+    config_dict, tokenized_citations, has_previous_pubs, prev_pubs = ref_srch_modularized.input_reading_and_checking(config_json_filepath, ref_path_or_URL, MEDLINE_reference, no_Crossref, no_PubMed, prev_pub_filepath, citation_match_ratio)       
 
-    publication_dict, tokenized_citations = ref_srch_modularized.build_publication_dict(config_dict, tokenized_citations, no_Crossref, no_PubMed)
+    publication_dict, tokenized_citations, all_queries = ref_srch_modularized.build_publication_dict(config_dict, tokenized_citations, no_Crossref, no_PubMed, citation_match_ratio)
 
     save_dir_name = ref_srch_modularized.save_and_send_reports_and_emails(config_dict, tokenized_citations, publication_dict, prev_pubs, has_previous_pubs, test)
 
     fileio.save_publications_to_file(save_dir_name, publication_dict, {})
     fileio.save_json_to_file(save_dir_name, "tokenized_reference.json", tokenized_citations)
 
+    if save_all_results:
+        fileio.save_json_to_file(save_dir_name, "all_results.json", all_queries)
+    
     helper_functions.vprint("Success. Publications and reports saved in " + save_dir_name)
 
 
 
@@ -20,7 +20,7 @@
 from . import webio
 
 
-def input_reading_and_checking(config_json_filepath, no_ORCID, no_GoogleScholar, no_Crossref, no_PubMed):
+def input_reading_and_checking(config_json_filepath, no_ORCID, no_GoogleScholar, no_Crossref, no_PubMed, citation_match_ratio):
     """Read in inputs from user and do error checking.
     
     Args:
@@ -29,10 +29,17 @@ def input_reading_and_checking(config_json_filepath, no_ORCID, no_GoogleScholar,
         no_GoogleScholar (bool): if True search Google Scholar else don't. Reduces checking on config JSON if True.
         no_Crossref (bool): If True search Crossref else don't. Reduces checking on config JSON if True.
         no_PubMed (bool): If True search PubMed else don't. Reduces checking on config JSON if True.
+        citation_match_ratio (int): should be an integer between 0-100.
         
     Returns:
         config_dict (dict): Matches the Configuration file JSON schema.
     """
+    if not isinstance(citation_match_ratio, int):
+        helper_functions.vprint("Error: The given citation-match-ratio is not an integer value.")
+        sys.exit()
+    elif citation_match_ratio > 100 or citation_match_ratio < 0:
+        helper_functions.vprint("Error: The given citation-match-ratio is not within the range 0-100.")
+        sys.exit()
 
     ## read in config file
     config_dict = fileio.load_json(config_json_filepath)
@@ -86,7 +93,7 @@ def generate_internal_data_and_check_authors(config_dict):
 
 
 
-def build_publication_dict(config_dict, prev_pubs, no_ORCID, no_GoogleScholar, no_Crossref, no_PubMed):
+def build_publication_dict(config_dict, prev_pubs, no_ORCID, no_GoogleScholar, no_Crossref, no_PubMed, citation_match_ratio):
     """Query PubMed, ORCID, Google Scholar, and Crossref for publications for the authors.
     
     Args:
@@ -96,61 +103,62 @@ def build_publication_dict(config_dict, prev_pubs, no_ORCID, no_GoogleScholar, n
         no_GoogleScholar (bool): if True search Google Scholar else don't.
         no_Crossref (bool): If True search Crossref else don't.
         no_PubMed (bool): If True search PubMed else don't.
+        citation_match_ratio (int): if the fuzzy ratio between 2 citations is greater than or equal to this, then consider them to match.
         
     Returns:
-        publication_dict (dict): The dictionary matching the publication JSON schema.
-        prev_pubs (dict): Same as input, but updated with the new publications found.
+        running_pubs (dict): The dictionary matching the publication JSON schema.
+        all_queries (dict): The pubs searched for each source and each author. {"PubMed":{"author1":[pub1, ...], ...}, "ORCID":{"author1":[pub1, ...], ...}, "Google Scholar":{"author1":[pub1, ...], ...}, "Crossref":{"author1":[pub1, ...], ...}}
     """
 
     ## Get publications from PubMed 
     helper_functions.vprint("Finding author's publications. This could take a while.")
-    current_pubs = {}
+    running_pubs = {}
+    all_queries = {}
     if not no_PubMed:
         helper_functions.vprint("Searching PubMed.")
-        PubMed_publication_dict = athr_srch_webio.search_PubMed_for_pubs(current_pubs, config_dict["Authors"], config_dict["PubMed_search"]["PubMed_email"])
-        current_pubs.update(PubMed_publication_dict)
+        running_pubs, PubMed_publication_dict = athr_srch_webio.search_PubMed_for_pubs(running_pubs, config_dict["Authors"], config_dict["PubMed_search"]["PubMed_email"], citation_match_ratio)
+        all_queries["PubMed"] = PubMed_publication_dict
     if not no_ORCID:
         helper_functions.vprint("Searching ORCID.")
-        ORCID_publication_dict = athr_srch_webio.search_ORCID_for_pubs(current_pubs, config_dict["ORCID_search"]["ORCID_key"], config_dict["ORCID_search"]["ORCID_secret"], config_dict["Authors"])
-        current_pubs.update(ORCID_publication_dict)
+        running_pubs, ORCID_publication_dict = athr_srch_webio.search_ORCID_for_pubs(running_pubs, config_dict["ORCID_search"]["ORCID_key"], config_dict["ORCID_search"]["ORCID_secret"], config_dict["Authors"], citation_match_ratio)
+        all_queries["ORCID"] = ORCID_publication_dict
     if not no_GoogleScholar:
         helper_functions.vprint("Searching Google Scholar.")
-        Google_Scholar_publication_dict = athr_srch_webio.search_Google_Scholar_for_pubs(current_pubs, config_dict["Authors"], config_dict["Crossref_search"]["mailto_email"])
-        current_pubs.update(Google_Scholar_publication_dict)
+        running_pubs, Google_Scholar_publication_dict = athr_srch_webio.search_Google_Scholar_for_pubs(running_pubs, config_dict["Authors"], config_dict["Crossref_search"]["mailto_email"], citation_match_ratio)
+        all_queries["Google Scholar"] = Google_Scholar_publication_dict
     if not no_Crossref:
         helper_functions.vprint("Searching Crossref.")
-        Crossref_publication_dict = athr_srch_webio.search_Crossref_for_pubs(current_pubs, config_dict["Authors"], config_dict["Crossref_search"]["mailto_email"])
-        current_pubs.update(Crossref_publication_dict)
+        running_pubs, Crossref_publication_dict = athr_srch_webio.search_Crossref_for_pubs(running_pubs, config_dict["Authors"], config_dict["Crossref_search"]["mailto_email"], citation_match_ratio)
+        all_queries["Crossref"] = Crossref_publication_dict
 
-    publication_dict = {}
+    ## Do a second pass using the saved queries.
     if not no_PubMed:
-        for key, value in PubMed_publication_dict.items():
-            if not key in publication_dict:
-                publication_dict[key] = value
+        running_pubs, PubMed_publication_dict = athr_srch_webio.search_PubMed_for_pubs(running_pubs, config_dict["Authors"], config_dict["PubMed_search"]["PubMed_email"], citation_match_ratio, all_queries["PubMed"])
     if not no_ORCID:
-        for key, value in ORCID_publication_dict.items():
-            if not key in publication_dict:
-                publication_dict[key] = value
+        running_pubs, ORCID_publication_dict = athr_srch_webio.search_ORCID_for_pubs(running_pubs, config_dict["ORCID_search"]["ORCID_key"], config_dict["ORCID_search"]["ORCID_secret"], config_dict["Authors"], citation_match_ratio, all_queries["ORCID"])
     if not no_GoogleScholar:
-        for key, value in Google_Scholar_publication_dict.items():
-            if not key in publication_dict:
-                publication_dict[key] = value
+        running_pubs, Google_Scholar_publication_dict = athr_srch_webio.search_Google_Scholar_for_pubs(running_pubs, config_dict["Authors"], config_dict["Crossref_search"]["mailto_email"], citation_match_ratio, all_queries["Google Scholar"])
     if not no_Crossref:
-        for key, value in Crossref_publication_dict.items():
-            if not key in publication_dict:
-                publication_dict[key] = value
-    
+        running_pubs, Crossref_publication_dict = athr_srch_webio.search_Crossref_for_pubs(running_pubs, config_dict["Authors"], config_dict["Crossref_search"]["mailto_email"], citation_match_ratio, all_queries["Crossref"])
+        
     ## Compare current pubs with previous and only keep those that are new or updated.
     for pub_id, pub_values in prev_pubs.items():
-        if pub_id in publication_dict and not deepdiff.DeepDiff(publication_dict[pub_id], pub_values, ignore_order=True, report_repetition=True):
-            del publication_dict[pub_id]
-        
+        if pub_id in running_pubs and not deepdiff.DeepDiff(running_pubs[pub_id], pub_values, ignore_order=True, report_repetition=True):
+            del running_pubs[pub_id]
 
-    if len(publication_dict) == 0:
+    if len(running_pubs) == 0:
         helper_functions.vprint("No new publications found.")
         sys.exit()
 
-    return publication_dict
+    ## Convert PubMed articles class to dicts so they can be saved as JSON.
+    if not no_PubMed:
+        for author, pub_list in all_queries["PubMed"].items():
+            new_list = []
+            for pub in pub_list:
+                new_list.append(helper_functions.modify_pub_dict_for_saving(pub, True))
+            all_queries["PubMed"][author] = new_list
+        
+    return running_pubs, all_queries