Skip to content

Commit 7ddaba8

Browse files
author
s2010515
committed
Update code, add keywords, see version 0.3.14
1 parent d472308 commit 7ddaba8

File tree

9 files changed

+110
-9
lines changed

9 files changed

+110
-9
lines changed

README.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,8 @@ The dataframe columns are:
136136
- Abstract (from PubMed metadata).
137137
- mesh <class 'list'>
138138
- MeSH (Medical Subject Headings) provided by Medline.
139+
- keywords <class 'list'>
140+
- This field contains largely non-MeSH subject terms that describe the content of an article; beginning in January 2013, these are author-supplied keywords.
139141
- authors <class 'list'>
140142
- journal <class 'str'>
141143
- pub_type <class 'list'>
@@ -272,6 +274,13 @@ A: It seems that you are on a shared computer, you need to identify who is the o
272274

273275
## Version
274276

277+
### Version 0.3.14
278+
-> Add the keyword field from the medline file to the result.
279+
280+
-> Fixed the data type used when reading the Medline file in the add_mesh step.
281+
282+
-> Fixed code where 1 article was missing if using list of PMIDs as update.
283+
275284
### Version 0.3.13
276285
-> Since Crossref retired the API key feature that let Elsevier and Wiley identify the author of a publication request, the wiley_api_key and elsevier_api_key optional parameters have been added as input parameters. They are not mandatory, but they greatly increase the retrieval rate because they give access to Wiley and Elsevier publications respectively.
277286

cadmus/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,3 +63,4 @@
6363
from cadmus.post_retrieval.parsed_to_df import parsed_to_df
6464
from cadmus.retrieval.edirect import pipeline
6565
from cadmus.pre_retrieval.display_export_path import display_export_path
66+
from cadmus.pre_retrieval.add_keywords import add_keywords

cadmus/main/bioscraping.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
from cadmus.post_retrieval.clean_up_dir import clean_up_dir
3838
from cadmus.pre_retrieval.add_mesh_remove_preprint import add_mesh_remove_preprint
3939
from cadmus.pre_retrieval.change_output_structure import change_output_structure
40+
from cadmus.pre_retrieval.add_keywords import add_keywords
4041

4142
def bioscraping(input_function, email, api_key, wiley_api_key = None, elsevier_api_key = None, start = None, idx = None , full_search = None, keep_abstract = True, click_through_api_key = 'XXXXXXXX-XXXXXXXX-XXXXXXXX-XXXXXXXX'):
4243
# first bioscraping checks whether this is an update of a previous search or a new search.
@@ -57,6 +58,9 @@ def bioscraping(input_function, email, api_key, wiley_api_key = None, elsevier_a
5758
if 'mesh' not in original_df.columns:
5859
print('Implementing changes to your previous result due to change in the library.')
5960
original_df = add_mesh_remove_preprint(original_df)
61+
if 'keywords' not in original_df.columns:
62+
print('Implementing changes to your previous result due to change in the library.')
63+
original_df = add_keywords(original_df)
6064
if original_df.iloc[0].content_text == 0 or original_df.iloc[0].content_text == 1:
6165
pass
6266
else:

cadmus/pre_retrieval/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,4 +7,5 @@
77
from cadmus.pre_retrieval.check_for_retrieved_df import check_for_retrieved_df
88
from cadmus.pre_retrieval.add_mesh_remove_preprint import add_mesh_remove_preprint
99
from cadmus.pre_retrieval.change_output_structure import change_output_structure
10-
from cadmus.pre_retrieval.display_export_path import display_export_path
10+
from cadmus.pre_retrieval.display_export_path import display_export_path
11+
from cadmus.pre_retrieval.add_keywords import add_keywords
Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
import json
2+
import pandas as pd
3+
import subprocess
4+
import zipfile
5+
import glob
6+
import os
7+
8+
def add_keywords(df):
    """Add a 'keywords' column to the retrieval dataframe.

    Re-parses the Medline records previously saved under
    ./output/medline/txts (zip archives of Medline-format text),
    extracts the 'OT' (Other Term / keyword) lines for each PMID,
    merges them into *df* as a new 'keywords' column (a list per
    article, or None when the record has no keywords), and re-saves the
    updated dataframe to ./output/retrieved_df/retrieved_df2.json.zip.

    Parameters
    ----------
    df : pandas.DataFrame
        Retrieval dataframe with at least a string 'pmid' column plus
        the standard cadmus result columns.

    Returns
    -------
    pandas.DataFrame
        The input dataframe with 'keywords' inserted after 'mesh'.
        Articles whose PMID is absent from the Medline files are
        dropped (inner merge), matching add_mesh_remove_preprint.
    """
    # List the saved Medline archives directly with glob instead of
    # parsing `ls -lR` output, which was shell-dependent and fragile.
    archive_paths = sorted(glob.glob('./output/medline/txts/*'))

    # Read every member of every archive; each entry is one decoded
    # Medline text blob.
    contents = []
    for path in archive_paths:
        with zipfile.ZipFile(path, 'r') as z:
            for member in z.namelist():
                with z.open(member) as fh:
                    contents.append(fh.read().decode('utf-8'))

    # BUGFIX: the previous code split only contents[0] into lines,
    # silently ignoring every archive after the first.
    total_list = '\n'.join(contents).split('\n')

    # Walk the Medline lines, pairing each PMID with the 'OT' keyword
    # lines that follow it ('OTO' is the keyword-owner tag, not a keyword).
    my_pmid_filtered = []
    my_kw_filtered = []
    current_kw = []
    current = False
    for line in total_list:
        # BUGFIX: these tests used to be independent `if`s, so the very
        # first PMID line matched both branches: it was appended twice
        # and an empty keyword list was inserted first, which made
        # drop_duplicates (keep-first) discard the first article's real
        # keywords. `elif` makes the branches mutually exclusive.
        if line[:4] == 'PMID' and not current:
            my_pmid_filtered.append(line)
            current = True
        elif line[:4] == 'PMID':
            my_kw_filtered.append(current_kw)
            current_kw = []
            my_pmid_filtered.append(line)
        elif line[:2] == 'OT' and line[:3] != 'OTO':
            current_kw.append(line)
    # Flush the keywords collected for the last record.
    my_kw_filtered.append(current_kw)

    # Strip the Medline field prefixes, keeping only the values.
    # NOTE(review): the canonical Medline prefix for OT is the tag
    # padded to four characters ('OT  - ') -- confirm the literal below
    # matches the files actually written by the retrieval step.
    my_pmid_filtered = [pmid.replace('PMID- ', '') for pmid in my_pmid_filtered]
    my_kw_filtered = [[kw.replace('OT - ', '') for kw in kws] for kws in my_kw_filtered]

    df_keywords = pd.DataFrame(list(zip(my_pmid_filtered, my_kw_filtered)),
                               columns=['pmid', 'keywords'])

    df_keywords = df_keywords.drop_duplicates(subset=['pmid'])
    # Records without any 'OT' line get None rather than an empty list.
    for index, row in df_keywords.iterrows():
        if df_keywords.keywords.loc[index] == []:
            df_keywords.loc[index, 'keywords'] = None

    # Inner merge: articles not present in the Medline files are dropped.
    df = df.reset_index().merge(df_keywords, on='pmid').set_index('index')
    df = df[['pmid', 'pmcid', 'title', 'abstract', 'mesh', 'keywords', 'authors', 'journal', 'pub_type', 'pub_date', 'doi', 'issn', 'crossref', 'full_text_links', 'licenses', 'pdf', 'xml', 'html', 'plain', 'pmc_tgz', 'xml_parse_d', 'html_parse_d', 'pdf_parse_d', 'plain_parse_d', 'content_text']]

    # Persist the updated dataframe, replacing any previous
    # retrieved_df2.json.zip (rename away, write, then delete the copy).
    df.pub_date = df.pub_date.astype(str)
    result = df.to_json(orient="index")
    out_zip = './output/retrieved_df/retrieved_df2.json.zip'
    tmp_zip = './output/retrieved_df/temp_retrieved_df2.json.zip'
    had_previous = len(glob.glob(out_zip)) != 0
    if had_previous:
        os.rename(out_zip, tmp_zip)
    with zipfile.ZipFile(out_zip, mode="w", compression=zipfile.ZIP_DEFLATED, compresslevel=9) as zip_file:
        # NOTE: `result` is already a JSON string, so json.dumps() here
        # double-encodes it; kept as-is for compatibility with whatever
        # reads retrieved_df2.json elsewhere in the pipeline.
        dumped_JSON: str = json.dumps(result, indent=4)
        zip_file.writestr("retrieved_df2.json", data=dumped_JSON)
        zip_file.testzip()
    if had_previous:
        os.remove(tmp_zip)

    return df

cadmus/pre_retrieval/add_mesh_remove_preprint.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -26,14 +26,12 @@ def add_mesh_remove_preprint(df):
2626
my_file = f.read()
2727
f.close()
2828
z.close()
29-
total_list.extend(my_file)
29+
total_list.append(str(str(my_file.decode('utf-8'))))
3030

31-
for i in range(len(total_list)):
32-
total_list[i] = total_list[i].replace('\n', '')
31+
total_list = total_list[0].split('\n')
3332

3433
my_pmid_filtered = []
3534
my_mh_filtered = []
36-
current_pmid = []
3735
current_mh = []
3836
current = False
3937
for i in range(len(total_list)):
@@ -58,8 +56,11 @@ def add_mesh_remove_preprint(df):
5856
columns =['pmid', 'mesh'])
5957

6058
df_mesh = df_mesh.drop_duplicates(subset=['pmid'])
59+
for index, row in df_mesh.iterrows():
60+
if df_mesh.mesh.loc[index] == []:
61+
df_mesh.loc[index, 'mesh'] = None
6162

62-
df = df.merge(df_mesh, on='pmid')
63+
df = df.reset_index().merge(df_mesh, on='pmid').set_index('index')
6364
df = df[['pmid', 'pmcid', 'title', 'abstract', 'mesh', 'authors', 'journal', 'pub_type', 'pub_date', 'doi', 'issn', 'crossref', 'full_text_links', 'licenses', 'pdf', 'xml', 'html', 'plain', 'pmc_tgz', 'xml_parse_d', 'html_parse_d', 'pdf_parse_d', 'plain_parse_d', 'content_text']]
6465

6566
index_to_keep = []

cadmus/pre_retrieval/creation_retrieved_df.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ def creation_retrieved_df(medline_file_name):
6363
if abstract == None or abstract == '':
6464
abstract = record.get('OAB')
6565
mesh_terms = record.get('MH')
66+
keywords = record.get('OT')
6667
authors = record.get('AU')
6768
journal_title = record.get('JT')
6869
pub_type = record.get('PT')
@@ -81,6 +82,7 @@ def creation_retrieved_df(medline_file_name):
8182
'title': title,
8283
'abstract': abstract,
8384
'mesh': mesh_terms,
85+
'keywords': keywords,
8486
'authors':authors,
8587
'journal':journal_title,
8688
'pub_type':pub_type,

cadmus/retrieval/search_terms_to_medline.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,13 +52,14 @@ def search_terms_to_medline(query_string, api_key):
5252
d = f.read()
5353
f.close()
5454
z.close()
55-
d = str(str(d.decode('utf-8')) + str(search_results)).encode('utf-8')
55+
d = str(str(d.decode('utf-8')) + '\n' + '\n' + str(search_results)).encode('utf-8')
5656
os.rename('./output/medline/txts/medline_output.txt.zip', './output/medline/txts/temp_medline_output.txt.zip')
5757
with zipfile.ZipFile("./output/medline/txts/medline_output.txt.zip", mode="a", compression=zipfile.ZIP_DEFLATED, compresslevel=9) as zip_file:
5858
zip_file.writestr("medline_output.txt", data=d)
5959
zip_file.testzip()
6060
zip_file.close()
6161
os.remove('./output/medline/txts/temp_medline_output.txt.zip')
62+
print('Medline Records retrieved and saved')
6263
else:
6364
#to avoid errors for large pmids list. We now chunk into smaller set of 9000. Finally we append every chunk in the medline text file.
6465
for i in range(len(query_string)):
@@ -76,7 +77,7 @@ def search_terms_to_medline(query_string, api_key):
7677
d = f.read()
7778
f.close()
7879
z.close()
79-
d = str(str(d.decode('utf-8')) + str(search_results)).encode('utf-8')
80+
d = str(str(d.decode('utf-8')) + '\n' + '\n' + str(search_results)).encode('utf-8')
8081
os.rename('./output/medline/txts/medline_output.txt.zip', './output/medline/txts/temp_medline_output.txt.zip')
8182
with zipfile.ZipFile("./output/medline/txts/medline_output.txt.zip", mode="a", compression=zipfile.ZIP_DEFLATED, compresslevel=9) as zip_file:
8283
zip_file.writestr("medline_output.txt", data=d)

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
setuptools.setup(
44
name="cadmus",
5-
version="0.3.13",
5+
version="0.3.14",
66
author="Jamie Campbell, Ian Simpson, Antoine Lain",
77
author_email="Jamie.campbell@igmm.ed.ac.uk, Ian.Simpson@ed.ac.uk, Antoine.Lain@ed.ac.uk",
88
description="This projects is to build full text retrieval system setup for generation of large biomedical corpora from published literature.",

0 commit comments

Comments
 (0)