@@ -46,9 +46,8 @@ def __init__(self, config, log, sampleinfo={}, force=False):
4646 self .sample = self .sampleinfo
4747 self .client = PubMLSTClient ()
4848
49-
5049 def identify_new (self , cg_id = "" , project = False ):
51- """ Automatically downloads pubMLST & NCBI organisms not already downloaded """
50+ """Automatically downloads pubMLST & NCBI organisms not already downloaded"""
5251 neworgs = list ()
5352 newrefs = list ()
5453 try :
@@ -91,9 +90,7 @@ def index_db(self, full_dir, suffix):
9190 """Check for indexation, makeblastdb job if not enough of them."""
9291 reindexation = False
9392 files = os .listdir (full_dir )
94- sufx_files = glob .glob (
95- "{}/*{}" .format (full_dir , suffix )
96- ) # List of source files
93+ sufx_files = glob .glob ("{}/*{}" .format (full_dir , suffix )) # List of source files
9794 for file in sufx_files :
9895 subsuf = "\{}$" .format (suffix )
9996 base = re .sub (subsuf , "" , file )
@@ -105,10 +102,7 @@ def index_db(self, full_dir, suffix):
105102 if os .path .basename (base ) == elem [: elem .rfind ("." )]:
106103 bases = bases + 1
107104 # Number of index files fresher than source (6)
108- if (
109- os .stat (file ).st_mtime
110- < os .stat ("{}/{}" .format (full_dir , elem )).st_mtime
111- ):
105+ if os .stat (file ).st_mtime < os .stat ("{}/{}" .format (full_dir , elem )).st_mtime :
112106 newer = newer + 1
113107 # 7 for parse_seqids, 4 for not.
114108 if not (bases == 7 or newer == 6 ) and not (bases == 4 and newer == 3 ):
@@ -121,18 +115,16 @@ def index_db(self, full_dir, suffix):
121115 )
122116 # MLST locis
123117 else :
124- bash_cmd = "makeblastdb -in {}/{} -dbtype nucl -parse_seqids -out {}" .format (
125- full_dir , os .path .basename (file ), os .path .basename (base )
118+ bash_cmd = (
119+ "makeblastdb -in {}/{} -dbtype nucl -parse_seqids -out {}" .format (
120+ full_dir , os .path .basename (file ), os .path .basename (base )
121+ )
126122 )
127- proc = subprocess .Popen (
128- bash_cmd .split (), cwd = full_dir , stdout = subprocess .PIPE
129- )
123+ proc = subprocess .Popen (bash_cmd .split (), cwd = full_dir , stdout = subprocess .PIPE )
130124 proc .communicate ()
131125 except Exception as e :
132126 self .logger .error (
133- "Unable to index requested target {} in {}" .format (
134- file , full_dir
135- )
127+ "Unable to index requested target {} in {}" .format (file , full_dir )
136128 )
137129 if reindexation :
138130 self .logger .info ("Re-indexed contents of {}" .format (full_dir ))
@@ -145,7 +137,7 @@ def fetch_external(self, force=False):
145137 for entry in root :
146138 # Check organism
147139 species = entry .text .strip ()
148- organ = species .lower ().replace (" " , "_" )
140+ organ = species .lower ().replace (" " , "_" )
149141 if "escherichia_coli" in organ and "#1" in organ :
150142 organ = organ [:- 2 ]
151143 if organ in self .organisms :
@@ -154,15 +146,11 @@ def fetch_external(self, force=False):
154146 st_link = entry .find ("./mlst/database/profiles/url" ).text
155147 profiles_query = urllib .request .urlopen (st_link )
156148 profile_no = profiles_query .readlines ()[- 1 ].decode ("utf-8" ).split ("\t " )[0 ]
157- if (
158- organ .replace ("_" , " " ) not in self .updated
159- and (
160- int (profile_no .replace ("-" , "" )) > int (currver .replace ("-" , "" ))
161- or force
162- )
149+ if organ .replace ("_" , " " ) not in self .updated and (
150+ int (profile_no .replace ("-" , "" )) > int (currver .replace ("-" , "" )) or force
163151 ):
164152 # Download MLST profiles
165- self .logger .info ("Downloading new MLST profiles for " + species )
153+ self .logger .info ("Downloading new MLST profiles for " + species )
166154 output = "{}/{}" .format (self .config ["folders" ]["profiles" ], organ )
167155 urllib .request .urlretrieve (st_link , output )
168156 # Clear existing directory and download allele files
@@ -172,7 +160,9 @@ def fetch_external(self, force=False):
172160 for locus in entry .findall ("./mlst/database/loci/locus" ):
173161 locus_name = locus .text .strip ()
174162 locus_link = locus .find ("./url" ).text
175- urllib .request .urlretrieve (locus_link , "{}/{}.tfa" .format (out , locus_name ))
163+ urllib .request .urlretrieve (
164+ locus_link , "{}/{}.tfa" .format (out , locus_name )
165+ )
176166 # Create new indexes
177167 self .index_db (out , ".tfa" )
178168 # Update database
@@ -183,9 +173,7 @@ def fetch_external(self, force=False):
183173 )
184174 self .db_access .reload_profiletable (organ )
185175 except Exception as e :
186- self .logger .warning (
187- "Unable to update pubMLST external data: {}" .format (e )
188- )
176+ self .logger .warning ("Unable to update pubMLST external data: {}" .format (e ))
189177
190178 def resync (self , type = "" , sample = "" , ignore = False ):
191179 """Manipulates samples that have an internal ST that differs from pubMLST ST"""
@@ -228,9 +216,7 @@ def fetch_resistances(self, force=False):
228216
229217 for file in os .listdir (hiddensrc ):
230218 if file not in actual and (".fsa" in file ):
231- self .logger .info (
232- "resFinder database files corrupted. Syncing..."
233- )
219+ self .logger .info ("resFinder database files corrupted. Syncing..." )
234220 wipeIndex = True
235221 break
236222
@@ -262,12 +248,12 @@ def fetch_resistances(self, force=False):
262248 self .index_db (self .config ["folders" ]["resistances" ], ".fsa" )
263249
264250 def existing_organisms (self ):
265- """ Returns list of all organisms currently added """
251+ """Returns list of all organisms currently added"""
266252 return self .organisms
267253
268254 def organism2reference (self , normal_organism_name ):
269255 """Finds which reference contains the same words as the organism
270- and returns it in a format for database calls. Returns empty string if none found"""
256+ and returns it in a format for database calls. Returns empty string if none found"""
271257 orgs = os .listdir (self .config ["folders" ]["references" ])
272258 organism = re .split (r"\W+" , normal_organism_name .lower ())
273259 try :
@@ -296,13 +282,11 @@ def organism2reference(self, normal_organism_name):
296282 )
297283
298284 def download_ncbi (self , reference ):
299- """ Checks available references, downloads from NCBI if not present """
285+ """Checks available references, downloads from NCBI if not present"""
300286 try :
301287 DEVNULL = open (os .devnull , "wb" )
302288 Entrez .email = "2@2.com"
303- record = Entrez .efetch (
304- db = "nucleotide" , id = reference , rettype = "fasta" , retmod = "text"
305- )
289+ record = Entrez .efetch (db = "nucleotide" , id = reference , rettype = "fasta" , retmod = "text" )
306290 sequence = record .read ()
307291 output = "{}/{}.fasta" .format (self .config ["folders" ]["genomes" ], reference )
308292 with open (output , "w" ) as f :
@@ -325,20 +309,16 @@ def download_ncbi(self, reference):
325309 out , err = proc .communicate ()
326310 self .logger .info ("Downloaded reference {}" .format (reference ))
327311 except Exception as e :
328- self .logger .warning (
329- "Unable to download genome '{}' from NCBI" .format (reference )
330- )
312+ self .logger .warning ("Unable to download genome '{}' from NCBI" .format (reference ))
331313
332314 def add_pubmlst (self , organism ):
333- """ Checks pubmlst for references of given organism and downloads them """
315+ """Checks pubmlst for references of given organism and downloads them"""
334316 # Organism must be in binomial format and only resolve to one hit
335317 errorg = organism
336318 try :
337319 organism = organism .lower ().replace ("." , " " )
338320 if organism .replace (" " , "_" ) in self .organisms and not self .force :
339- self .logger .info (
340- "Organism {} already stored in microSALT" .format (organism )
341- )
321+ self .logger .info ("Organism {} already stored in microSALT" .format (organism ))
342322 return
343323 db_query = self .query_pubmlst ()
344324
@@ -360,9 +340,7 @@ def add_pubmlst(self, organism):
360340 seqdef_url = subtype ["href" ]
361341 desc = subtype ["description" ]
362342 counter += 1.0
363- self .logger .info (
364- "Located pubMLST hit {} for sample" .format (desc )
365- )
343+ self .logger .info ("Located pubMLST hit {} for sample" .format (desc ))
366344 if counter > 2.0 :
367345 raise Exception (
368346 "Reference '{}' resolved to {} organisms. Please be more stringent" .format (
@@ -372,9 +350,7 @@ def add_pubmlst(self, organism):
372350 elif counter < 1.0 :
373351 # add external
374352 raise Exception (
375- "Unable to find requested organism '{}' in pubMLST database" .format (
376- errorg
377- )
353+ "Unable to find requested organism '{}' in pubMLST database" .format (errorg )
378354 )
379355 else :
380356 truename = desc .lower ().split (" " )
@@ -387,16 +363,15 @@ def add_pubmlst(self, organism):
387363 self .logger .warning (e .args [0 ])
388364
389365 def query_pubmlst (self ):
390- """ Returns a json object containing all organisms available via pubmlst.org """
366+ """Returns a json object containing all organisms available via pubmlst.org"""
391367 db_query = self .client .query_databases ()
392368 return db_query
393369
394-
395370 def get_mlst_scheme (self , subtype_href ):
396- """ Returns the path for the MLST data scheme at pubMLST """
371+ """Returns the path for the MLST data scheme at pubMLST"""
397372 try :
398373 parsed_data = self .client .parse_pubmlst_url (subtype_href )
399- db = parsed_data .get ('db' )
374+ db = parsed_data .get ("db" )
400375 if not db :
401376 self .logger .warning (f"Could not extract database name from URL: { subtype_href } " )
402377 return None
@@ -424,49 +399,51 @@ def get_mlst_scheme(self, subtype_href):
424399 self .logger .warning (e )
425400 return None
426401
427-
428402 def external_version (self , organism , subtype_href ):
429- """ Returns the version (date) of the data available on pubMLST """
403+ """Returns the version (date) of the data available on pubMLST"""
430404 try :
431405 mlst_href = self .get_mlst_scheme (subtype_href )
432406 if not mlst_href :
433407 self .logger .warning (f"MLST scheme not found for URL: { subtype_href } " )
434408 return None
435409
436410 parsed_data = self .client .parse_pubmlst_url (mlst_href )
437- db = parsed_data .get ('db' )
438- scheme_id = parsed_data .get (' scheme_id' )
411+ db = parsed_data .get ("db" )
412+ scheme_id = parsed_data .get (" scheme_id" )
439413 if not db or not scheme_id :
440- self .logger .warning (f"Could not extract database name or scheme ID from MLST URL: { mlst_href } " )
414+ self .logger .warning (
415+ f"Could not extract database name or scheme ID from MLST URL: { mlst_href } "
416+ )
441417 return None
442418
443419 scheme_info = self .client .retrieve_scheme_info (db , scheme_id )
444420 last_updated = scheme_info .get ("last_updated" )
445421 if last_updated :
446- self .logger .debug (f"Retrieved last_updated: { last_updated } for organism: { organism } " )
422+ self .logger .debug (
423+ f"Retrieved last_updated: { last_updated } for organism: { organism } "
424+ )
447425 return last_updated
448426 else :
449- self .logger .warning (f"No 'last_updated' field found for db: { db } , scheme_id: { scheme_id } " )
427+ self .logger .warning (
428+ f"No 'last_updated' field found for db: { db } , scheme_id: { scheme_id } "
429+ )
450430 return None
451431 except Exception as e :
452432 self .logger .warning (f"Could not determine pubMLST version for { organism } " )
453433 self .logger .warning (e )
454434 return None
455435
456-
457436 def download_pubmlst (self , organism , subtype_href , force = False ):
458- """ Downloads ST and loci for a given organism stored on pubMLST if it is more recent. Returns update date """
437+ """Downloads ST and loci for a given organism stored on pubMLST if it is more recent. Returns update date"""
459438 organism = organism .lower ().replace (" " , "_" )
460439 try :
461440 # Pull version
462441 extver = self .external_version (organism , subtype_href )
463442 currver = self .db_access .get_version (f"profile_{ organism } " )
464- if (
465- int (extver .replace ("-" , "" ))
466- <= int (currver .replace ("-" , "" ))
467- and not force
468- ):
469- self .logger .info (f"Profile for { organism .replace ('_' , ' ' ).capitalize ()} already at the latest version." )
443+ if int (extver .replace ("-" , "" )) <= int (currver .replace ("-" , "" )) and not force :
444+ self .logger .info (
445+ f"Profile for { organism .replace ('_' , ' ' ).capitalize ()} already at the latest version."
446+ )
470447 return currver
471448
472449 # Retrieve the MLST scheme URL
@@ -477,18 +454,25 @@ def download_pubmlst(self, organism, subtype_href, force=False):
477454
478455 # Parse the database name and scheme ID
479456 parsed_data = self .client .parse_pubmlst_url (mlst_href )
480- db = parsed_data .get ('db' )
481- scheme_id = parsed_data .get (' scheme_id' )
457+ db = parsed_data .get ("db" )
458+ scheme_id = parsed_data .get (" scheme_id" )
482459 if not db or not scheme_id :
483- self .logger .warning (f"Could not extract database name or scheme ID from MLST URL: { mlst_href } " )
460+ self .logger .warning (
461+ f"Could not extract database name or scheme ID from MLST URL: { mlst_href } "
462+ )
484463 return None
485464
486465 # Step 1: Download the profiles CSV
487466 st_target = f"{ self .config ['folders' ]['profiles' ]} /{ organism } "
488467 profiles_csv = self .client .download_profiles_csv (db , scheme_id )
489468 # Only write the first 8 columns, this avoids adding information such as "clonal_complex" and "species"
490469 profiles_csv = profiles_csv .split ("\n " )
491- profiles_csv = "\n " .join ("\t " .join ([line .split ("\t " )[:8 ] for line in profiles_csv ]))
470+ trimmed_profiles = []
471+ for line in profiles_csv :
472+ trimmed_profiles .append ("\t " .join (line .split ("\t " )[:8 ]))
473+
474+ profiles_csv = "\n " .join (trimmed_profiles )
475+
492476 with open (st_target , "w" ) as profile_file :
493477 profile_file .write (profiles_csv )
494478 self .logger .info (f"Profiles CSV downloaded to { st_target } " )
@@ -518,9 +502,8 @@ def download_pubmlst(self, organism, subtype_href, force=False):
518502 self .logger .error (f"Failed to download data for { organism } : { e } " )
519503 return None
520504
521-
522505 def fetch_pubmlst (self , force = False ):
523- """ Updates reference for data that is stored on pubMLST """
506+ """Updates reference for data that is stored on pubMLST"""
524507 seqdef_url = dict ()
525508 db_query = self .query_pubmlst ()
526509
0 commit comments