Skip to content

Commit 7ddaba8

Browse files
author
s2010515
committed
Update code, add keywords, see version 0.3.14
1 parent d472308 commit 7ddaba8

File tree

9 files changed

+110
-9
lines changed

9 files changed

+110
-9
lines changed

README.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,8 @@ The dataframe columns are:
136136
- Abstract (from PubMed metadata).
137137
- mesh <class 'list'>
138138
- MeSH (Medical Subject Headings) provided by Medline.
139+
- keywords <class 'list'>
140+
- This field contains largely non-MeSH subject terms that describe the content of an article; beginning in January 2013, these are author-supplied keywords.
139141
- authors <class 'list'>
140142
- journal <class 'str'>
141143
- pub_type <class 'list'>
@@ -272,6 +274,13 @@ A: It seems that you are on a shared computer, you need to identify who is the o
272274

273275
## Version
274276

277+
### Version 0.3.14
278+
-> Add the keyword field from the medline file to the result.
279+
280+
-> Fixed the data type used when reading the Medline file in the add_mesh step.
281+
282+
-> Fixed code where 1 article was missing if using list of PMIDs as update.
283+
275284
### Version 0.3.13
276285
-> Since Crossref retired the API key feature that let Elsevier and Wiley identify the author of a publication request, the wiley_api_key and elsevier_api_key optional parameters have been added as input parameters. They are not mandatory, but they greatly increase the retrieval rate because they give access to Wiley and Elsevier publications respectively.
277286

cadmus/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,3 +63,4 @@
6363
from cadmus.post_retrieval.parsed_to_df import parsed_to_df
6464
from cadmus.retrieval.edirect import pipeline
6565
from cadmus.pre_retrieval.display_export_path import display_export_path
66+
from cadmus.pre_retrieval.add_keywords import add_keywords

cadmus/main/bioscraping.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
from cadmus.post_retrieval.clean_up_dir import clean_up_dir
3838
from cadmus.pre_retrieval.add_mesh_remove_preprint import add_mesh_remove_preprint
3939
from cadmus.pre_retrieval.change_output_structure import change_output_structure
40+
from cadmus.pre_retrieval.add_keywords import add_keywords
4041

4142
def bioscraping(input_function, email, api_key, wiley_api_key = None, elsevier_api_key = None, start = None, idx = None , full_search = None, keep_abstract = True, click_through_api_key = 'XXXXXXXX-XXXXXXXX-XXXXXXXX-XXXXXXXX'):
4243
# first bioscraping checks whether this is an update of a previous search or a new search.
@@ -57,6 +58,9 @@ def bioscraping(input_function, email, api_key, wiley_api_key = None, elsevier_a
5758
if 'mesh' not in original_df.columns:
5859
print('Implementing changes to your previous result due to change in the library.')
5960
original_df = add_mesh_remove_preprint(original_df)
61+
if 'keywords' not in original_df.columns:
62+
print('Implementing changes to your previous result due to change in the library.')
63+
original_df = add_keywords(original_df)
6064
if original_df.iloc[0].content_text == 0 or original_df.iloc[0].content_text == 1:
6165
pass
6266
else:

cadmus/pre_retrieval/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,4 +7,5 @@
77
from cadmus.pre_retrieval.check_for_retrieved_df import check_for_retrieved_df
88
from cadmus.pre_retrieval.add_mesh_remove_preprint import add_mesh_remove_preprint
99
from cadmus.pre_retrieval.change_output_structure import change_output_structure
10-
from cadmus.pre_retrieval.display_export_path import display_export_path
10+
from cadmus.pre_retrieval.display_export_path import display_export_path
11+
from cadmus.pre_retrieval.add_keywords import add_keywords
Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
import json
2+
import pandas as pd
3+
import subprocess
4+
import zipfile
5+
import glob
6+
import os
7+
8+
def add_keywords(df):
    """Add a 'keywords' column to the retrieval dataframe.

    Re-parses the Medline records previously saved under
    ./output/medline/txts (zip archives of Medline-format text),
    extracts the 'OT' (Other Term / keyword) lines for each PMID,
    merges them into *df* as a new 'keywords' column (a list per
    article, or None when the record has no keywords), and re-saves the
    updated dataframe to ./output/retrieved_df/retrieved_df2.json.zip.

    Parameters
    ----------
    df : pandas.DataFrame
        Retrieval dataframe with at least a string 'pmid' column plus
        the standard cadmus result columns.

    Returns
    -------
    pandas.DataFrame
        The input dataframe with 'keywords' inserted after 'mesh'.
        Articles whose PMID is absent from the Medline files are
        dropped (inner merge), matching add_mesh_remove_preprint.
    """
    # List the saved Medline archives directly with glob instead of
    # parsing `ls -lR` output, which was shell-dependent and fragile.
    archive_paths = sorted(glob.glob('./output/medline/txts/*'))

    # Read every member of every archive; each entry is one decoded
    # Medline text blob.
    contents = []
    for path in archive_paths:
        with zipfile.ZipFile(path, 'r') as z:
            for member in z.namelist():
                with z.open(member) as fh:
                    contents.append(fh.read().decode('utf-8'))

    # BUGFIX: the previous code split only contents[0] into lines,
    # silently ignoring every archive after the first.
    total_list = '\n'.join(contents).split('\n')

    # Walk the Medline lines, pairing each PMID with the 'OT' keyword
    # lines that follow it ('OTO' is the keyword-owner tag, not a keyword).
    my_pmid_filtered = []
    my_kw_filtered = []
    current_kw = []
    current = False
    for line in total_list:
        # BUGFIX: these tests used to be independent `if`s, so the very
        # first PMID line matched both branches: it was appended twice
        # and an empty keyword list was inserted first, which made
        # drop_duplicates (keep-first) discard the first article's real
        # keywords. `elif` makes the branches mutually exclusive.
        if line[:4] == 'PMID' and not current:
            my_pmid_filtered.append(line)
            current = True
        elif line[:4] == 'PMID':
            my_kw_filtered.append(current_kw)
            current_kw = []
            my_pmid_filtered.append(line)
        elif line[:2] == 'OT' and line[:3] != 'OTO':
            current_kw.append(line)
    # Flush the keywords collected for the last record.
    my_kw_filtered.append(current_kw)

    # Strip the Medline field prefixes, keeping only the values.
    # NOTE(review): the canonical Medline prefix for OT is the tag
    # padded to four characters ('OT  - ') -- confirm the literal below
    # matches the files actually written by the retrieval step.
    my_pmid_filtered = [pmid.replace('PMID- ', '') for pmid in my_pmid_filtered]
    my_kw_filtered = [[kw.replace('OT - ', '') for kw in kws] for kws in my_kw_filtered]

    df_keywords = pd.DataFrame(list(zip(my_pmid_filtered, my_kw_filtered)),
                               columns=['pmid', 'keywords'])

    df_keywords = df_keywords.drop_duplicates(subset=['pmid'])
    # Records without any 'OT' line get None rather than an empty list.
    for index, row in df_keywords.iterrows():
        if df_keywords.keywords.loc[index] == []:
            df_keywords.loc[index, 'keywords'] = None

    # Inner merge: articles not present in the Medline files are dropped.
    df = df.reset_index().merge(df_keywords, on='pmid').set_index('index')
    df = df[['pmid', 'pmcid', 'title', 'abstract', 'mesh', 'keywords', 'authors', 'journal', 'pub_type', 'pub_date', 'doi', 'issn', 'crossref', 'full_text_links', 'licenses', 'pdf', 'xml', 'html', 'plain', 'pmc_tgz', 'xml_parse_d', 'html_parse_d', 'pdf_parse_d', 'plain_parse_d', 'content_text']]

    # Persist the updated dataframe, replacing any previous
    # retrieved_df2.json.zip (rename away, write, then delete the copy).
    df.pub_date = df.pub_date.astype(str)
    result = df.to_json(orient="index")
    out_zip = './output/retrieved_df/retrieved_df2.json.zip'
    tmp_zip = './output/retrieved_df/temp_retrieved_df2.json.zip'
    had_previous = len(glob.glob(out_zip)) != 0
    if had_previous:
        os.rename(out_zip, tmp_zip)
    with zipfile.ZipFile(out_zip, mode="w", compression=zipfile.ZIP_DEFLATED, compresslevel=9) as zip_file:
        # NOTE: `result` is already a JSON string, so json.dumps() here
        # double-encodes it; kept as-is for compatibility with whatever
        # reads retrieved_df2.json elsewhere in the pipeline.
        dumped_JSON: str = json.dumps(result, indent=4)
        zip_file.writestr("retrieved_df2.json", data=dumped_JSON)
        zip_file.testzip()
    if had_previous:
        os.remove(tmp_zip)

    return df

cadmus/pre_retrieval/add_mesh_remove_preprint.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -26,14 +26,12 @@ def add_mesh_remove_preprint(df):
2626
my_file = f.read()
2727
f.close()
2828
z.close()
29-
total_list.extend(my_file)
29+
total_list.append(str(str(my_file.decode('utf-8'))))
3030

31-
for i in range(len(total_list)):
32-
total_list[i] = total_list[i].replace('\n', '')
31+
total_list = total_list[0].split('\n')
3332

3433
my_pmid_filtered = []
3534
my_mh_filtered = []
36-
current_pmid = []
3735
current_mh = []
3836
current = False
3937
for i in range(len(total_list)):
@@ -58,8 +56,11 @@ def add_mesh_remove_preprint(df):
5856
columns =['pmid', 'mesh'])
5957

6058
df_mesh = df_mesh.drop_duplicates(subset=['pmid'])
59+
for index, row in df_mesh.iterrows():
60+
if df_mesh.mesh.loc[index] == []:
61+
df_mesh.loc[index, 'mesh'] = None
6162

62-
df = df.merge(df_mesh, on='pmid')
63+
df = df.reset_index().merge(df_mesh, on='pmid').set_index('index')
6364
df = df[['pmid', 'pmcid', 'title', 'abstract', 'mesh', 'authors', 'journal', 'pub_type', 'pub_date', 'doi', 'issn', 'crossref', 'full_text_links', 'licenses', 'pdf', 'xml', 'html', 'plain', 'pmc_tgz', 'xml_parse_d', 'html_parse_d', 'pdf_parse_d', 'plain_parse_d', 'content_text']]
6465

6566
index_to_keep = []

cadmus/pre_retrieval/creation_retrieved_df.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ def creation_retrieved_df(medline_file_name):
6363
if abstract == None or abstract == '':
6464
abstract = record.get('OAB')
6565
mesh_terms = record.get('MH')
66+
keywords = record.get('OT')
6667
authors = record.get('AU')
6768
journal_title = record.get('JT')
6869
pub_type = record.get('PT')
@@ -81,6 +82,7 @@ def creation_retrieved_df(medline_file_name):
8182
'title': title,
8283
'abstract': abstract,
8384
'mesh': mesh_terms,
85+
'keywords': keywords,
8486
'authors':authors,
8587
'journal':journal_title,
8688
'pub_type':pub_type,

cadmus/retrieval/search_terms_to_medline.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,13 +52,14 @@ def search_terms_to_medline(query_string, api_key):
5252
d = f.read()
5353
f.close()
5454
z.close()
55-
d = str(str(d.decode('utf-8')) + str(search_results)).encode('utf-8')
55+
d = str(str(d.decode('utf-8')) + '\n' + '\n' + str(search_results)).encode('utf-8')
5656
os.rename('./output/medline/txts/medline_output.txt.zip', './output/medline/txts/temp_medline_output.txt.zip')
5757
with zipfile.ZipFile("./output/medline/txts/medline_output.txt.zip", mode="a", compression=zipfile.ZIP_DEFLATED, compresslevel=9) as zip_file:
5858
zip_file.writestr("medline_output.txt", data=d)
5959
zip_file.testzip()
6060
zip_file.close()
6161
os.remove('./output/medline/txts/temp_medline_output.txt.zip')
62+
print('Medline Records retrieved and saved')
6263
else:
6364
#to avoid errors for large pmids list. We now chunk into smaller set of 9000. Finally we append every chunk in the medline text file.
6465
for i in range(len(query_string)):
@@ -76,7 +77,7 @@ def search_terms_to_medline(query_string, api_key):
7677
d = f.read()
7778
f.close()
7879
z.close()
79-
d = str(str(d.decode('utf-8')) + str(search_results)).encode('utf-8')
80+
d = str(str(d.decode('utf-8')) + '\n' + '\n' + str(search_results)).encode('utf-8')
8081
os.rename('./output/medline/txts/medline_output.txt.zip', './output/medline/txts/temp_medline_output.txt.zip')
8182
with zipfile.ZipFile("./output/medline/txts/medline_output.txt.zip", mode="a", compression=zipfile.ZIP_DEFLATED, compresslevel=9) as zip_file:
8283
zip_file.writestr("medline_output.txt", data=d)

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
setuptools.setup(
44
name="cadmus",
5-
version="0.3.13",
5+
version="0.3.14",
66
author="Jamie Campbell, Ian Simpson, Antoine Lain",
77
author_email="Jamie.campbell@igmm.ed.ac.uk, Ian.Simpson@ed.ac.uk, Antoine.Lain@ed.ac.uk",
88
description="This projects is to build full text retrieval system setup for generation of large biomedical corpora from published literature.",

0 commit comments

Comments
 (0)