Skip to content

Commit 45dbbf0

Browse files
data_manager support for ncbi-amrfinderplus 4.0.x databases (#7404)
* data_manager support for ncbi-amrfinderplus 4.0.x databases
* Python linting fix
* Update data_managers/data_manager_build_amrfinderplus/data_manager/data_manager_build_amrfinderplus.xml

Co-authored-by: M Bernt <[email protected]>

---------

Co-authored-by: M Bernt <[email protected]>
1 parent b1e26e7 commit 45dbbf0

File tree

3 files changed

+177
-88
lines changed

3 files changed

+177
-88
lines changed

data_managers/data_manager_build_amrfinderplus/data_manager/data_manager_build_amrfinderplus.py

Lines changed: 150 additions & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -14,11 +14,13 @@ class GetAmrFinderPlusDataManager:
1414
Create the json file with database information for galaxy data manager
1515
"""
1616

17-
def __init__(self,
18-
amrfinderplus_database="amrfinderplus_versioned_database",
19-
db_name="amrfinderplus-db",
20-
amrfinderplus_version="latest",
21-
date_version=None):
17+
def __init__(
18+
self,
19+
amrfinderplus_database="amrfinderplus_versioned_database",
20+
db_name="amrfinderplus-db",
21+
amrfinderplus_version="latest",
22+
date_version=None,
23+
):
2224
self.data_table_name = amrfinderplus_database
2325
self._db_name = db_name
2426
self._amrfinderplus_version = amrfinderplus_version
@@ -31,11 +33,7 @@ def get_data_table_format(self):
3133
Skeleton of a data_table format
3234
return: a data table formatted for json output
3335
"""
34-
self.data_table_entry = {
35-
"data_tables": {
36-
self.data_table_name: {}
37-
}
38-
}
36+
self.data_table_entry = {"data_tables": {self.data_table_name: {}}}
3937
return self.data_table_entry
4038

4139
def get_data_manager(self):
@@ -44,14 +42,19 @@ def get_data_manager(self):
4442
return: The data table with database information
4543
"""
4644
self.amrfinderplus_table_list = self.get_data_table_format()
47-
amrfinderplus_value = f"amrfinderplus_V{self._amrfinderplus_version}" \
48-
f"_{self._amrfinderplus_date_version}"
49-
amrfinderplus_name = f"V{self._amrfinderplus_version}" \
50-
f"-{self._amrfinderplus_date_version}"
51-
data_info = dict(value=amrfinderplus_value,
52-
name=amrfinderplus_name,
53-
db_version=self._amrfinderplus_version,
54-
path=self._db_name)
45+
amrfinderplus_value = (
46+
f"amrfinderplus_V{self._amrfinderplus_version}"
47+
f"_{self._amrfinderplus_date_version}"
48+
)
49+
amrfinderplus_name = (
50+
f"V{self._amrfinderplus_version}" f"-{self._amrfinderplus_date_version}"
51+
)
52+
data_info = dict(
53+
value=amrfinderplus_value,
54+
name=amrfinderplus_name,
55+
db_version=self._amrfinderplus_version,
56+
path=self._db_name,
57+
)
5558
self.amrfinderplus_table_list["data_tables"][self.data_table_name] = [data_info]
5659
return self.amrfinderplus_table_list
5760

@@ -63,24 +66,28 @@ class DownloadAmrFinderPlusDatabase(GetAmrFinderPlusDataManager):
6366
Build the data manager infos for galaxy
6467
"""
6568

66-
def __init__(self,
67-
output_dir=Path.cwd(),
68-
ncbi_url="ftp.ncbi.nlm.nih.gov",
69-
ftp_login="anonymous",
70-
ftp_password="anonymous",
71-
amrfinderplus_database="amrfinderplus_database",
72-
db_name="amrfinderplus-db",
73-
amrfinderplus_version="latest",
74-
json_file_path=None,
75-
date_version=None,
76-
amrfinderplus_db_path=None,
77-
test_mode=False):
69+
def __init__(
70+
self,
71+
output_dir=Path.cwd(),
72+
ncbi_url="ftp.ncbi.nlm.nih.gov",
73+
ftp_login="anonymous",
74+
ftp_password="anonymous",
75+
amrfinderplus_database="amrfinderplus_database",
76+
db_name="amrfinderplus-db",
77+
amrfinderplus_version="latest",
78+
json_file_path=None,
79+
date_version=None,
80+
amrfinderplus_db_path=None,
81+
test_mode=False,
82+
):
7883

7984
super().__init__()
8085
self.json_file_path = json_file_path
8186
self._output_dir = output_dir
8287
self._ncbi_ftp_url = ncbi_url
83-
self._ncbi_database_path = "pathogen/Antimicrobial_resistance/AMRFinderPlus/database"
88+
self._ncbi_database_path = (
89+
"pathogen/Antimicrobial_resistance/AMRFinderPlus/database"
90+
)
8491
self._login = ftp_login
8592
self._password = ftp_password
8693
self._amrfinderplus_database = amrfinderplus_database
@@ -103,40 +110,61 @@ def subprocess_cmd(command, *args):
103110
[cmd.append(i) for i in args]
104111
proc = sp.run(cmd, stdout=sp.PIPE, stderr=sp.PIPE)
105112
if proc.returncode != 0:
106-
print(f'Error type {proc.returncode} with : \n {proc}')
113+
print(f"Error type {proc.returncode} with : \n {proc}")
107114

108115
def download_amrfinderplus_db(self):
109116
"""
110117
Download the amrfinderplus database from the ncbi ftp server
111118
"""
112-
self.amrfinderplus_db_path = f'{self._output_dir}/{self._db_name}'
119+
self.amrfinderplus_db_path = f"{self._output_dir}/{self._db_name}"
113120
os.makedirs(self.amrfinderplus_db_path)
114121

115-
amrfinderplus_ftp_path = f"ftp://{self._login}:" \
116-
f"{self._password}@{self._ncbi_ftp_url}/" \
117-
f"{self._ncbi_database_path}/" \
118-
f"{self._amrfinderplus_version}/" \
119-
f"{self._amrfinderplus_date_version}"
122+
if self._amrfinderplus_version == "latest":
123+
self.get_amrfinderplus_version()
124+
125+
amrfinderplus_ftp_path = (
126+
f"ftp://{self._login}:"
127+
f"{self._password}@{self._ncbi_ftp_url}/"
128+
f"{self._ncbi_database_path}/"
129+
f"{self._amrfinderplus_version}/"
130+
f"{self._amrfinderplus_date_version}"
131+
)
132+
133+
if self._amrfinderplus_version == "3.12":
134+
taxa_group_file = "taxgroup.tab"
135+
test_dna_fasta = "AMR_DNA-Escherichia"
136+
else:
137+
taxa_group_file = "taxgroup.tsv"
138+
test_dna_fasta = "AMR_DNA-Escherichia.fa"
120139
if self.test_mode is True:
121-
file_list = ["AMR_DNA-Escherichia", "version.txt", "taxgroup.tab", "database_format_version.txt"]
140+
file_list = [
141+
test_dna_fasta,
142+
"version.txt",
143+
taxa_group_file,
144+
"database_format_version.txt",
145+
]
122146
output_option = "-O"
123147
for file in file_list:
124-
self.subprocess_cmd("wget",
125-
"-nd",
126-
"-np",
127-
"-r",
128-
f"{amrfinderplus_ftp_path}/{file}",
129-
output_option,
130-
f"{self.amrfinderplus_db_path}/{file}")
148+
self.subprocess_cmd(
149+
"wget",
150+
"-nd",
151+
"-np",
152+
"-r",
153+
f"{amrfinderplus_ftp_path}/{file}",
154+
output_option,
155+
f"{self.amrfinderplus_db_path}/{file}",
156+
)
131157
else:
132158
output_option = "-P"
133-
self.subprocess_cmd("wget",
134-
"-nd",
135-
"-np",
136-
"-r",
137-
amrfinderplus_ftp_path,
138-
output_option,
139-
self.amrfinderplus_db_path)
159+
self.subprocess_cmd(
160+
"wget",
161+
"-nd",
162+
"-np",
163+
"-r",
164+
amrfinderplus_ftp_path,
165+
output_option,
166+
self.amrfinderplus_db_path,
167+
)
140168

141169
def make_hmm_profile(self):
142170
"""
@@ -153,27 +181,48 @@ def extract_filelist_makeblast(self):
153181
Extract le list of species which have file in the database
154182
return: a filtered species list of available species in the database
155183
"""
156-
taxa_group_path = Path(f"{self.amrfinderplus_db_path}/taxgroup.tab")
184+
if self._amrfinderplus_version == "3.12":
185+
taxa_group_file = "taxgroup.tab"
186+
else:
187+
taxa_group_file = "taxgroup.tsv"
188+
taxa_group_path = Path(f"{self.amrfinderplus_db_path}/{taxa_group_file}")
157189
if Path.exists(taxa_group_path):
158190
taxa_table = pd.read_table(taxa_group_path)
159-
taxa_table.columns = ["taxgroup", "gpipe_taxgroup", "number_of_nucl_ref_genes"]
160-
taxa_df = taxa_table[taxa_table.number_of_nucl_ref_genes > 0].filter(items=["taxgroup"], axis=1)
191+
taxa_table.columns = [
192+
"taxgroup",
193+
"gpipe_taxgroup",
194+
"number_of_nucl_ref_genes",
195+
]
196+
taxa_df = taxa_table[taxa_table.number_of_nucl_ref_genes > 0].filter(
197+
items=["taxgroup"], axis=1
198+
)
161199
if self.test_mode is True:
162200
taxa_df = taxa_df[taxa_df.taxgroup == "Escherichia"].taxgroup
163201
else:
164202
taxa_df = taxa_df.taxgroup
165203
self.species_list = list(taxa_df)
166204
else:
167-
print("taxgroup.tab file is missing to list available species")
205+
print(f"{taxa_group_file} file is missing to list available species")
168206

169207
def make_blastdb(self):
170208
"""
171209
Index fasta file for blast
172210
"""
173211
self.extract_filelist_makeblast()
174-
nucl_file_db_list = [f'{self.amrfinderplus_db_path}/AMR_DNA-{specie}' for specie in self.species_list]
175-
amr_dna = f'{self.amrfinderplus_db_path}/AMR_CDS'
176-
amr_prot = f'{self.amrfinderplus_db_path}/AMRProt'
212+
if self._amrfinderplus_version == "3.12":
213+
nucl_file_db_list = [
214+
f"{self.amrfinderplus_db_path}/AMR_DNA-{specie}"
215+
for specie in self.species_list
216+
]
217+
amr_dna = f"{self.amrfinderplus_db_path}/AMR_CDS"
218+
amr_prot = f"{self.amrfinderplus_db_path}/AMRProt"
219+
else:
220+
nucl_file_db_list = [
221+
f"{self.amrfinderplus_db_path}/AMR_DNA-{specie}.fa"
222+
for specie in self.species_list
223+
]
224+
amr_dna = f"{self.amrfinderplus_db_path}/AMR_CDS.fa"
225+
amr_prot = f"{self.amrfinderplus_db_path}/AMRProt.fa"
177226
os.chdir(self.amrfinderplus_db_path)
178227
if Path(amr_dna).exists():
179228
nucl_file_db_list.append(amr_dna)
@@ -183,10 +232,16 @@ def make_blastdb(self):
183232
self.subprocess_cmd("makeblastdb", "-in", amr_prot, "-dbtype", "prot")
184233
else:
185234
print("No file AMRProt detected for indexing")
186-
[self.subprocess_cmd("makeblastdb", "-in", file, "-dbtype", "nucl") for file in nucl_file_db_list]
235+
[
236+
self.subprocess_cmd("makeblastdb", "-in", file, "-dbtype", "nucl")
237+
for file in nucl_file_db_list
238+
]
187239

188-
def get_amrfinderplus_version(self, version_file="version.txt",
189-
database_version_file="database_format_version.txt"):
240+
def get_amrfinderplus_version(
241+
self,
242+
version_file="version.txt",
243+
database_version_file="database_format_version.txt",
244+
):
190245
"""
191246
Check the version when latest if provided and update the number
192247
param version_file: name of the file containing version information
@@ -197,27 +252,30 @@ def get_amrfinderplus_version(self, version_file="version.txt",
197252
ftp.cwd(f"{self._ncbi_database_path}/{self._amrfinderplus_version}")
198253
db_version = BytesIO()
199254
db_date_version = BytesIO()
200-
ftp.retrbinary(f'RETR {version_file}', db_version.write)
201-
ftp.retrbinary(f'RETR {database_version_file}', db_date_version.write)
202-
self._amrfinderplus_date_version = db_version.getvalue().decode("utf-8").splitlines()[0]
203-
self._amrfinderplus_version = '.'.join(
204-
db_date_version.getvalue().decode("utf-8").splitlines()[0].split(".")[:2])
255+
ftp.retrbinary(f"RETR {version_file}", db_version.write)
256+
ftp.retrbinary(f"RETR {database_version_file}", db_date_version.write)
257+
self._amrfinderplus_date_version = (
258+
db_version.getvalue().decode("utf-8").splitlines()[0]
259+
)
260+
self._amrfinderplus_version = ".".join(
261+
db_date_version.getvalue().decode("utf-8").splitlines()[0].split(".")[:2]
262+
)
205263

206264
def read_json_input_file(self):
207265
"""
208266
Import the json file
209267
"""
210268
with open(self.json_file_path) as fh:
211269
params = json.load(fh)
212-
target_dir = params['output_data'][0]['extra_files_path']
270+
target_dir = params["output_data"][0]["extra_files_path"]
213271
os.makedirs(target_dir)
214272
self._output_dir = target_dir
215273

216274
def write_json_infos(self):
217275
"""
218276
Write in the imported json file
219277
"""
220-
with open(self.json_file_path, 'w') as fh:
278+
with open(self.json_file_path, "w") as fh:
221279
json.dump(self.get_data_manager(), fh, sort_keys=True)
222280

223281

@@ -228,29 +286,38 @@ def parse_arguments():
228286
"""
229287
# parse options and arguments
230288
arg_parser = argparse.ArgumentParser()
231-
arg_parser.add_argument("data_manager_json",
232-
help="json file from galaxy")
233-
arg_parser.add_argument("--db_version", default="latest",
234-
help="select the major version of the database (e.g. 3.10, 3.8), default is latest")
235-
arg_parser.add_argument("--db_date",
236-
help="select the date into the database version (e.g. 2022-10-11.2)")
237-
arg_parser.add_argument("--test", action='store_true',
238-
help="option to test the script with an lighted database")
289+
arg_parser.add_argument("data_manager_json", help="json file from galaxy")
290+
arg_parser.add_argument(
291+
"--db_version",
292+
default="latest",
293+
help="select the major version of the database (e.g. 3.10, 3.8), default is latest",
294+
)
295+
arg_parser.add_argument(
296+
"--db_date",
297+
help="select the date into the database version (e.g. 2022-10-11.2)",
298+
)
299+
arg_parser.add_argument(
300+
"--test",
301+
action="store_true",
302+
help="option to test the script with an lighted database",
303+
)
239304
return arg_parser.parse_args()
240305

241306

242307
def main():
    """
    Command line entry point: parse the options, then read the Galaxy json
    file, download the database, build the hmm profile and the blast
    indexes, and write the data table back to the json file
    """
    cli_args = parse_arguments()
    downloader = DownloadAmrFinderPlusDatabase(
        json_file_path=cli_args.data_manager_json,
        amrfinderplus_version=cli_args.db_version,
        date_version=cli_args.db_date,
        test_mode=cli_args.test,
    )
    # The steps run strictly in this order
    build_steps = (
        downloader.read_json_input_file,
        downloader.download_amrfinderplus_db,
        downloader.make_hmm_profile,
        downloader.make_blastdb,
        downloader.write_json_infos,
    )
    for step in build_steps:
        step()


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)