Skip to content

Commit 8be2f19

Browse files
committed
fix trimming and joining
1 parent e5504a7 commit 8be2f19

File tree

1 file changed

+59
-76
lines changed

1 file changed

+59
-76
lines changed

microSALT/utils/referencer.py

Lines changed: 59 additions & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -46,9 +46,8 @@ def __init__(self, config, log, sampleinfo={}, force=False):
4646
self.sample = self.sampleinfo
4747
self.client = PubMLSTClient()
4848

49-
5049
def identify_new(self, cg_id="", project=False):
51-
""" Automatically downloads pubMLST & NCBI organisms not already downloaded """
50+
"""Automatically downloads pubMLST & NCBI organisms not already downloaded"""
5251
neworgs = list()
5352
newrefs = list()
5453
try:
@@ -91,9 +90,7 @@ def index_db(self, full_dir, suffix):
9190
"""Check for indexation, makeblastdb job if not enough of them."""
9291
reindexation = False
9392
files = os.listdir(full_dir)
94-
sufx_files = glob.glob(
95-
"{}/*{}".format(full_dir, suffix)
96-
) # List of source files
93+
sufx_files = glob.glob("{}/*{}".format(full_dir, suffix)) # List of source files
9794
for file in sufx_files:
9895
subsuf = "\{}$".format(suffix)
9996
base = re.sub(subsuf, "", file)
@@ -105,10 +102,7 @@ def index_db(self, full_dir, suffix):
105102
if os.path.basename(base) == elem[: elem.rfind(".")]:
106103
bases = bases + 1
107104
# Number of index files fresher than source (6)
108-
if (
109-
os.stat(file).st_mtime
110-
< os.stat("{}/{}".format(full_dir, elem)).st_mtime
111-
):
105+
if os.stat(file).st_mtime < os.stat("{}/{}".format(full_dir, elem)).st_mtime:
112106
newer = newer + 1
113107
# 7 for parse_seqids, 4 for not.
114108
if not (bases == 7 or newer == 6) and not (bases == 4 and newer == 3):
@@ -121,18 +115,16 @@ def index_db(self, full_dir, suffix):
121115
)
122116
# MLST locis
123117
else:
124-
bash_cmd = "makeblastdb -in {}/{} -dbtype nucl -parse_seqids -out {}".format(
125-
full_dir, os.path.basename(file), os.path.basename(base)
118+
bash_cmd = (
119+
"makeblastdb -in {}/{} -dbtype nucl -parse_seqids -out {}".format(
120+
full_dir, os.path.basename(file), os.path.basename(base)
121+
)
126122
)
127-
proc = subprocess.Popen(
128-
bash_cmd.split(), cwd=full_dir, stdout=subprocess.PIPE
129-
)
123+
proc = subprocess.Popen(bash_cmd.split(), cwd=full_dir, stdout=subprocess.PIPE)
130124
proc.communicate()
131125
except Exception as e:
132126
self.logger.error(
133-
"Unable to index requested target {} in {}".format(
134-
file, full_dir
135-
)
127+
"Unable to index requested target {} in {}".format(file, full_dir)
136128
)
137129
if reindexation:
138130
self.logger.info("Re-indexed contents of {}".format(full_dir))
@@ -145,7 +137,7 @@ def fetch_external(self, force=False):
145137
for entry in root:
146138
# Check organism
147139
species = entry.text.strip()
148-
organ = species.lower().replace(" ", "_")
140+
organ = species.lower().replace(" ", "_")
149141
if "escherichia_coli" in organ and "#1" in organ:
150142
organ = organ[:-2]
151143
if organ in self.organisms:
@@ -154,15 +146,11 @@ def fetch_external(self, force=False):
154146
st_link = entry.find("./mlst/database/profiles/url").text
155147
profiles_query = urllib.request.urlopen(st_link)
156148
profile_no = profiles_query.readlines()[-1].decode("utf-8").split("\t")[0]
157-
if (
158-
organ.replace("_", " ") not in self.updated
159-
and (
160-
int(profile_no.replace("-", "")) > int(currver.replace("-", ""))
161-
or force
162-
)
149+
if organ.replace("_", " ") not in self.updated and (
150+
int(profile_no.replace("-", "")) > int(currver.replace("-", "")) or force
163151
):
164152
# Download MLST profiles
165-
self.logger.info("Downloading new MLST profiles for " + species)
153+
self.logger.info("Downloading new MLST profiles for " + species)
166154
output = "{}/{}".format(self.config["folders"]["profiles"], organ)
167155
urllib.request.urlretrieve(st_link, output)
168156
# Clear existing directory and download allele files
@@ -172,7 +160,9 @@ def fetch_external(self, force=False):
172160
for locus in entry.findall("./mlst/database/loci/locus"):
173161
locus_name = locus.text.strip()
174162
locus_link = locus.find("./url").text
175-
urllib.request.urlretrieve(locus_link, "{}/{}.tfa".format(out, locus_name))
163+
urllib.request.urlretrieve(
164+
locus_link, "{}/{}.tfa".format(out, locus_name)
165+
)
176166
# Create new indexes
177167
self.index_db(out, ".tfa")
178168
# Update database
@@ -183,9 +173,7 @@ def fetch_external(self, force=False):
183173
)
184174
self.db_access.reload_profiletable(organ)
185175
except Exception as e:
186-
self.logger.warning(
187-
"Unable to update pubMLST external data: {}".format(e)
188-
)
176+
self.logger.warning("Unable to update pubMLST external data: {}".format(e))
189177

190178
def resync(self, type="", sample="", ignore=False):
191179
"""Manipulates samples that have an internal ST that differs from pubMLST ST"""
@@ -228,9 +216,7 @@ def fetch_resistances(self, force=False):
228216

229217
for file in os.listdir(hiddensrc):
230218
if file not in actual and (".fsa" in file):
231-
self.logger.info(
232-
"resFinder database files corrupted. Syncing..."
233-
)
219+
self.logger.info("resFinder database files corrupted. Syncing...")
234220
wipeIndex = True
235221
break
236222

@@ -262,12 +248,12 @@ def fetch_resistances(self, force=False):
262248
self.index_db(self.config["folders"]["resistances"], ".fsa")
263249

264250
def existing_organisms(self):
265-
""" Returns list of all organisms currently added """
251+
"""Returns list of all organisms currently added"""
266252
return self.organisms
267253

268254
def organism2reference(self, normal_organism_name):
269255
"""Finds which reference contains the same words as the organism
270-
and returns it in a format for database calls. Returns empty string if none found"""
256+
and returns it in a format for database calls. Returns empty string if none found"""
271257
orgs = os.listdir(self.config["folders"]["references"])
272258
organism = re.split(r"\W+", normal_organism_name.lower())
273259
try:
@@ -296,13 +282,11 @@ def organism2reference(self, normal_organism_name):
296282
)
297283

298284
def download_ncbi(self, reference):
299-
""" Checks available references, downloads from NCBI if not present """
285+
"""Checks available references, downloads from NCBI if not present"""
300286
try:
301287
DEVNULL = open(os.devnull, "wb")
302288
Entrez.email = "2@2.com"
303-
record = Entrez.efetch(
304-
db="nucleotide", id=reference, rettype="fasta", retmod="text"
305-
)
289+
record = Entrez.efetch(db="nucleotide", id=reference, rettype="fasta", retmod="text")
306290
sequence = record.read()
307291
output = "{}/{}.fasta".format(self.config["folders"]["genomes"], reference)
308292
with open(output, "w") as f:
@@ -325,20 +309,16 @@ def download_ncbi(self, reference):
325309
out, err = proc.communicate()
326310
self.logger.info("Downloaded reference {}".format(reference))
327311
except Exception as e:
328-
self.logger.warning(
329-
"Unable to download genome '{}' from NCBI".format(reference)
330-
)
312+
self.logger.warning("Unable to download genome '{}' from NCBI".format(reference))
331313

332314
def add_pubmlst(self, organism):
333-
""" Checks pubmlst for references of given organism and downloads them """
315+
"""Checks pubmlst for references of given organism and downloads them"""
334316
# Organism must be in binomial format and only resolve to one hit
335317
errorg = organism
336318
try:
337319
organism = organism.lower().replace(".", " ")
338320
if organism.replace(" ", "_") in self.organisms and not self.force:
339-
self.logger.info(
340-
"Organism {} already stored in microSALT".format(organism)
341-
)
321+
self.logger.info("Organism {} already stored in microSALT".format(organism))
342322
return
343323
db_query = self.query_pubmlst()
344324

@@ -360,9 +340,7 @@ def add_pubmlst(self, organism):
360340
seqdef_url = subtype["href"]
361341
desc = subtype["description"]
362342
counter += 1.0
363-
self.logger.info(
364-
"Located pubMLST hit {} for sample".format(desc)
365-
)
343+
self.logger.info("Located pubMLST hit {} for sample".format(desc))
366344
if counter > 2.0:
367345
raise Exception(
368346
"Reference '{}' resolved to {} organisms. Please be more stringent".format(
@@ -372,9 +350,7 @@ def add_pubmlst(self, organism):
372350
elif counter < 1.0:
373351
# add external
374352
raise Exception(
375-
"Unable to find requested organism '{}' in pubMLST database".format(
376-
errorg
377-
)
353+
"Unable to find requested organism '{}' in pubMLST database".format(errorg)
378354
)
379355
else:
380356
truename = desc.lower().split(" ")
@@ -387,16 +363,15 @@ def add_pubmlst(self, organism):
387363
self.logger.warning(e.args[0])
388364

389365
def query_pubmlst(self):
390-
""" Returns a json object containing all organisms available via pubmlst.org """
366+
"""Returns a json object containing all organisms available via pubmlst.org"""
391367
db_query = self.client.query_databases()
392368
return db_query
393369

394-
395370
def get_mlst_scheme(self, subtype_href):
396-
""" Returns the path for the MLST data scheme at pubMLST """
371+
"""Returns the path for the MLST data scheme at pubMLST"""
397372
try:
398373
parsed_data = self.client.parse_pubmlst_url(subtype_href)
399-
db = parsed_data.get('db')
374+
db = parsed_data.get("db")
400375
if not db:
401376
self.logger.warning(f"Could not extract database name from URL: {subtype_href}")
402377
return None
@@ -424,49 +399,51 @@ def get_mlst_scheme(self, subtype_href):
424399
self.logger.warning(e)
425400
return None
426401

427-
428402
def external_version(self, organism, subtype_href):
429-
""" Returns the version (date) of the data available on pubMLST """
403+
"""Returns the version (date) of the data available on pubMLST"""
430404
try:
431405
mlst_href = self.get_mlst_scheme(subtype_href)
432406
if not mlst_href:
433407
self.logger.warning(f"MLST scheme not found for URL: {subtype_href}")
434408
return None
435409

436410
parsed_data = self.client.parse_pubmlst_url(mlst_href)
437-
db = parsed_data.get('db')
438-
scheme_id = parsed_data.get('scheme_id')
411+
db = parsed_data.get("db")
412+
scheme_id = parsed_data.get("scheme_id")
439413
if not db or not scheme_id:
440-
self.logger.warning(f"Could not extract database name or scheme ID from MLST URL: {mlst_href}")
414+
self.logger.warning(
415+
f"Could not extract database name or scheme ID from MLST URL: {mlst_href}"
416+
)
441417
return None
442418

443419
scheme_info = self.client.retrieve_scheme_info(db, scheme_id)
444420
last_updated = scheme_info.get("last_updated")
445421
if last_updated:
446-
self.logger.debug(f"Retrieved last_updated: {last_updated} for organism: {organism}")
422+
self.logger.debug(
423+
f"Retrieved last_updated: {last_updated} for organism: {organism}"
424+
)
447425
return last_updated
448426
else:
449-
self.logger.warning(f"No 'last_updated' field found for db: {db}, scheme_id: {scheme_id}")
427+
self.logger.warning(
428+
f"No 'last_updated' field found for db: {db}, scheme_id: {scheme_id}"
429+
)
450430
return None
451431
except Exception as e:
452432
self.logger.warning(f"Could not determine pubMLST version for {organism}")
453433
self.logger.warning(e)
454434
return None
455435

456-
457436
def download_pubmlst(self, organism, subtype_href, force=False):
458-
""" Downloads ST and loci for a given organism stored on pubMLST if it is more recent. Returns update date """
437+
"""Downloads ST and loci for a given organism stored on pubMLST if it is more recent. Returns update date"""
459438
organism = organism.lower().replace(" ", "_")
460439
try:
461440
# Pull version
462441
extver = self.external_version(organism, subtype_href)
463442
currver = self.db_access.get_version(f"profile_{organism}")
464-
if (
465-
int(extver.replace("-", ""))
466-
<= int(currver.replace("-", ""))
467-
and not force
468-
):
469-
self.logger.info(f"Profile for {organism.replace('_', ' ').capitalize()} already at the latest version.")
443+
if int(extver.replace("-", "")) <= int(currver.replace("-", "")) and not force:
444+
self.logger.info(
445+
f"Profile for {organism.replace('_', ' ').capitalize()} already at the latest version."
446+
)
470447
return currver
471448

472449
# Retrieve the MLST scheme URL
@@ -477,18 +454,25 @@ def download_pubmlst(self, organism, subtype_href, force=False):
477454

478455
# Parse the database name and scheme ID
479456
parsed_data = self.client.parse_pubmlst_url(mlst_href)
480-
db = parsed_data.get('db')
481-
scheme_id = parsed_data.get('scheme_id')
457+
db = parsed_data.get("db")
458+
scheme_id = parsed_data.get("scheme_id")
482459
if not db or not scheme_id:
483-
self.logger.warning(f"Could not extract database name or scheme ID from MLST URL: {mlst_href}")
460+
self.logger.warning(
461+
f"Could not extract database name or scheme ID from MLST URL: {mlst_href}"
462+
)
484463
return None
485464

486465
# Step 1: Download the profiles CSV
487466
st_target = f"{self.config['folders']['profiles']}/{organism}"
488467
profiles_csv = self.client.download_profiles_csv(db, scheme_id)
489468
# Only write the first 8 columns, this avoids adding information such as "clonal_complex" and "species"
490469
profiles_csv = profiles_csv.split("\n")
491-
profiles_csv = "\n".join("\t".join([line.split("\t")[:8] for line in profiles_csv]))
470+
trimmed_profiles = []
471+
for line in profiles_csv:
472+
trimmed_profiles.append("\t".join(line.split("\t")[:8]))
473+
474+
profiles_csv = "\n".join(trimmed_profiles)
475+
492476
with open(st_target, "w") as profile_file:
493477
profile_file.write(profiles_csv)
494478
self.logger.info(f"Profiles CSV downloaded to {st_target}")
@@ -518,9 +502,8 @@ def download_pubmlst(self, organism, subtype_href, force=False):
518502
self.logger.error(f"Failed to download data for {organism}: {e}")
519503
return None
520504

521-
522505
def fetch_pubmlst(self, force=False):
523-
""" Updates reference for data that is stored on pubMLST """
506+
"""Updates reference for data that is stored on pubMLST"""
524507
seqdef_url = dict()
525508
db_query = self.query_pubmlst()
526509

0 commit comments

Comments
 (0)