2626import sys
2727import json
2828import time
29- import hashlib
3029import requests
3130import itertools
3231import multiprocessing
@@ -250,19 +249,49 @@ def quality_control(locus_input):
250249 - A list with the identifiers of invalid alleles
251250 (list of str).
252251 """
253- res = sm .get_seqs_dicts (* locus_input )
254-
255- prots_file = '{0}_prots' .format (locus_input [0 ].split ('.fasta' )[0 ])
256- fo .pickle_dumper (res [1 ], prots_file )
257-
258- if len (res [2 ]) > 0 :
259- print (' Found {0} invalid alleles for '
260- 'locus {1}.' .format (len (res [2 ]), locus_input [1 ]))
252+ fasta_path , locus_id , output_directories , table_id , min_len , size_threshold = locus_input
253+
254+ # Import DNA sequences
255+ dna_seqs = fao .import_sequences (fasta_path )
256+ # Translate sequences
257+ _ , protein_file , _ , invalid = fao .translate_fasta (fasta_path , output_directories [1 ], table_id )
258+ prot_seqs = fao .import_sequences (protein_file )
259+
260+ if size_threshold is not None and len (prot_seqs ) > 0 :
261+ # Remove alleles based on length mode and size threshold
262+ modes , alm , asm , allele_sizes = sm .mode_filter (dna_seqs , size_threshold )
263+ excluded = set (asm + alm )
264+ # Remove excluded alleles from dictionaries
265+ dna_seqs = {seqid : seq
266+ for seqid , seq in dna_seqs .items ()
267+ if seqid not in excluded }
268+ prot_seqs = {seqid : seq
269+ for seqid , seq in prot_seqs .items ()
270+ if seqid not in excluded }
271+
272+ modes_concat = ':' .join (map (str , modes ))
273+ st_percentage = int (size_threshold * 100 )
274+ invalid += [[s , ct .ALM_MSG .format (st_percentage , allele_sizes [s ], modes_concat )] for s in alm ]
275+ invalid += [[s , ct .ASM_MSG .format (st_percentage , allele_sizes [s ], modes_concat )] for s in asm ]
276+
277+ # Rewrite FASTA files with valid sequences only
278+ if len (invalid ) > 0 :
279+ # Create FASTA file with valid DNA sequences
280+ valid_fasta = os .path .join (output_directories [0 ], '{0}.fasta' .format (locus_id ))
281+ dna_records = fao .fasta_lines (ct .FASTA_RECORD_TEMPLATE , [(seqid , seq ) for seqid , seq in dna_seqs .items ()])
282+ fo .write_lines (dna_records , valid_fasta )
283+ # Create FASTA file with valid protein sequences
284+ valid_prot_file = protein_file
285+ prot_records = fao .fasta_lines (ct .FASTA_RECORD_TEMPLATE , [(seqid , seq ) for seqid , seq in prot_seqs .items ()])
286+ fo .write_lines (prot_records , valid_prot_file )
287+ else :
288+ valid_fasta = fasta_path
289+ valid_prot_file = protein_file
261290
262- return [locus_input [ 0 ], prots_file , res [ 2 ] ]
291+ return [valid_fasta , valid_prot_file , invalid ]
263292
264293
265- def create_lengths_files (loci_files , out_dir ):
294+ def create_lengths_files (loci_files , output_directory ):
266295 """Determine the length of the sequences in a set of FASTA files.
267296
268297 Parameters
@@ -287,7 +316,7 @@ def create_lengths_files(loci_files, out_dir):
287316 for file in loci_files :
288317 locus_basename = fo .file_basename (file )
289318 locus_lengths = {locus_basename : fao .sequence_lengths (file , True )}
290- lengths_file = os .path .join (out_dir ,
319+ lengths_file = os .path .join (output_directory ,
291320 '{0}_lengths' .format (locus_basename .split ('.fasta' )[0 ]))
292321
293322 fo .pickle_dumper (locus_lengths , lengths_file )
@@ -340,7 +369,7 @@ def schema_completedness(base_url, species_id, schema_id, headers_get,
340369 the species and linked to the schema, respectively. The
341370 last element is the BLAKE2 hash of the locus file.
342371 """
343- # get info about loci and alleles upload
372+ # Get info about loci and alleles upload
344373 schema_loci = cr .simple_get_request (base_url , headers_get ,
345374 ['species' , species_id ,
346375 'schemas' , schema_id ,
@@ -352,10 +381,10 @@ def schema_completedness(base_url, species_id, schema_id, headers_get,
352381 schema_loci = schema_loci ['hashes' ]
353382
354383 loci_info = {hashed_files [k ]: v [1 ]+ [k ] for k , v in schema_loci .items ()}
355- # determine loci that were not fully inserted
384+ # Determine loci that were not fully inserted
356385 absent_loci = {hashed_files [k ]: v [1 ]+ [k ]
357386 for k , v in schema_loci .items () if all (v [1 ]) is not True }
358- # determine loci without alleles
387+ # Determine loci without alleles
359388 absent_alleles = [hashed_files [k ]
360389 for k , v in schema_loci .items () if v [0 ] is False ]
361390
@@ -431,7 +460,7 @@ def create_schema(base_url, headers_post, species_id, params):
431460 return [schema_url , schema_id ]
432461
433462
434- def create_loci_file (schema_files , annotations , schema_dir ,
463+ def create_loci_file (schema_files , output_directory , annotations , schema_dir ,
435464 species_id , schema_id , loci_prefix ,
436465 absent_loci = None ):
437466 """Create a file with the essential data to insert loci in Chewie-NS.
@@ -462,14 +491,14 @@ def create_loci_file(schema_files, annotations, schema_dir,
462491 Path to the file with the necessary data to insert loci
463492 and associate with species and schema.
464493 """
465- loci_file = os .path .join (schema_dir ,
494+ loci_file = os .path .join (output_directory ,
466495 '{0}_{1}_loci' .format (species_id , schema_id ))
467496 loci_data = [loci_prefix , []]
468497 for file in schema_files :
469498 if absent_loci is None or file in absent_loci :
470499
471500 locus = os .path .basename (file ).split ('.fasta' )[0 ]
472- locus_hash = fo .hash_file (file , hashlib . blake2b () )
501+ locus_hash = fo .hash_file (file , 'blake2b' )
473502
474503 locus_annotations = annotations [locus ]
475504
@@ -521,15 +550,15 @@ def upload_loci_data(loci_file, base_url, species_id,
521550 - If the response returned from the NS indicates that it
522551 was not possible to create and associate any/some loci.
523552 """
524- # compress file with loci data to reduce upload size
553+ # Compress file with loci data to reduce upload size
525554 loci_zip_file = '{0}.zip' .format (loci_file )
526555 fo .file_zipper (loci_file , loci_zip_file )
527556
528557 zip_url = cr .make_url (base_url , 'species' , species_id ,
529558 'schemas' , schema_id , 'loci' ,
530559 'data' )
531560
532- # upload file as multipart-encoded file
561+ # Upload file as multipart-encoded file
533562 filename = '{0}_{1}_loci.zip' .format (species_id , schema_id )
534563 response = cr .upload_file (loci_zip_file , filename ,
535564 zip_url , headers_post ,
@@ -573,7 +602,7 @@ def upload_loci_data(loci_file, base_url, species_id,
573602 return response_data
574603
575604
576- def create_alleles_files (schema_files , loci_responses , invalid_alleles ,
605+ def create_alleles_files (schema_files , output_directory , loci_responses , invalid_alleles ,
577606 species_name , base_url , species_id ,
578607 schema_id , user_id ):
579608 """Create files with data to insert alleles in Chewie-NS.
@@ -634,7 +663,7 @@ def create_alleles_files(schema_files, loci_responses, invalid_alleles,
634663 locus_name = os .path .basename (file ).split ('.fasta' )[0 ]
635664 loci_names .append (locus_name )
636665
637- alleles_file = os .path .join (schema_dir ,
666+ alleles_file = os .path .join (output_directory ,
638667 '{0}_{1}_{2}' .format (species_id ,
639668 schema_id ,
640669 locus_id ))
@@ -882,26 +911,30 @@ def main(schema_directory, species_id, schema_name, loci_prefix,
882911 print ('Schema exists and is incomplete '
883912 '("{0}", id={1})' .format (schema_name , schema_id ))
884913
914+ # Create folder to store temporary files
915+ upload_temp_dir = os .path .join (schema_directory , 'upload_temp' )
916+ fo .create_directory (upload_temp_dir )
917+
885918 # Get schema description
886919 if continue_up is False :
887920 if description_file is not None and os .path .isfile (description_file ) is True :
888921 # Determine file hash
889- description_hash = fo .hash_file (description_file , hashlib . blake2b () )
922+ description_hash = fo .hash_file (description_file , 'blake2b' )
890923 print ('Schema description: {0}' .format (description_file ))
891924 else :
892925 print ('Could not get a description from a file. '
893926 'Will use the name of the schema as description.' )
894- description_file = 'schema_description.txt'
927+ description_file = fo . join_paths ( upload_temp_dir , [ 'schema_description.txt' ])
895928 with open (description_file , 'w' ) as sd :
896929 sd .write (schema_name )
897- description_hash = fo .hash_file (description_file , hashlib . blake2b () )
930+ description_hash = fo .hash_file (description_file , 'blake2b' )
898931
899932 params ['SchemaDescription' ] = description_hash
900933
901934 print ('\n -- Schema Pre-processing --' )
902935
903936 # Hash schema files to get unique identifiers based on content
904- hashed_files = {fo .hash_file (file , hashlib . blake2b () ): file for file in fasta_paths }
937+ hashed_files = {fo .hash_file (file , 'blake2b' ): file for file in fasta_paths }
905938 print ('Determining data to upload...' )
906939 absent_loci = fasta_paths
907940 if upload_type [0 ] == 'incomplete' :
@@ -914,9 +947,16 @@ def main(schema_directory, species_id, schema_name, loci_prefix,
914947 print (' Loci without the full set of alleles: '
915948 '{0}\n ' .format (len (fasta_paths )))
916949
950+ # Create temporary directory to store FASTA files containing only the DNA and protein sequences for the valid alleles
951+ valid_fasta_dir = os .path .join (upload_temp_dir , 'valid_fasta' )
952+ fo .create_directory (valid_fasta_dir )
953+ translated_dir = os .path .join (upload_temp_dir , 'translated_fasta' )
954+ fo .create_directory (translated_dir )
955+
917956 # Create inputs for QC step
918957 inputs = [(file ,
919958 file .split ('/' )[- 1 ].split ('.fasta' )[0 ],
959+ [valid_fasta_dir , translated_dir ],
920960 int (params ['translation_table' ]),
921961 0 ,
922962 None ) for file in fasta_paths ]
@@ -941,8 +981,8 @@ def main(schema_directory, species_id, schema_name, loci_prefix,
941981 prot_files = [r [1 ] for r in qc_results ]
942982
943983 # Determine loci missing annotations
944- miss_annotation = [os .path .basename (pf .split ('_prots' )[0 ]) for pf in prot_files
945- if pf .split ('_prots' )[0 ] + '.fasta' in absent_loci ]
984+ miss_annotation = [os .path .basename (pf .split ('_protein' )[0 ]) for pf in prot_files
985+ if pf .split ('_protein' )[0 ] + '.fasta' in absent_loci ]
946986
947987 print ('Missing annotations for {0} loci' .format (len (miss_annotation )))
948988
@@ -969,8 +1009,7 @@ def main(schema_directory, species_id, schema_name, loci_prefix,
9691009 loci_annotations [l ].extend (['N/A' , 'N/A' , 'N/A' , 'N/A' , 'N/A' ])
9701010 print ('\n User provided valid annotations for {0} loci.' .format (provided ))
9711011
972- # convert parameters to string type because Chewie-NS
973- # expects strings
1012+ # Convert parameters to string type because Chewie-NS expects strings
9741013 params = {k : str (v ) for k , v in params .items ()}
9751014 params ['schema_hashes' ] = list (hashed_files .keys ())
9761015
@@ -982,7 +1021,7 @@ def main(schema_directory, species_id, schema_name, loci_prefix,
9821021 headers_post ,
9831022 species_id ,
9841023 params )
985- # send file with description
1024+ # Send file with description
9861025 description_uri = cr .make_url (nomenclature_server , 'species' ,
9871026 species_id , 'schemas' , schema_id ,
9881027 'description' )
@@ -997,7 +1036,7 @@ def main(schema_directory, species_id, schema_name, loci_prefix,
9971036 print ('Continuing upload of schema with '
9981037 'name {0} (id={1})\n ' .format (schema_name , schema_id ))
9991038
1000- # start creating new loci and adding/linking alleles
1039+ # Start creating new loci and adding/linking alleles
10011040 print ('Loci data:' )
10021041
10031042 if upload_type [0 ] == 'incomplete' :
@@ -1006,8 +1045,9 @@ def main(schema_directory, species_id, schema_name, loci_prefix,
10061045 for k , v in loci_info .items ()}
10071046 print ('All loci were inserted in a previous process.\n ' )
10081047 else :
1048+ loci_data_dir = os .path .join (upload_temp_dir , 'loci_data' )
10091049 print (' Collecting loci data...' )
1010- loci_file = create_loci_file (dna_files , loci_annotations ,
1050+ loci_file = create_loci_file (dna_files , loci_data_dir , loci_annotations ,
10111051 schema_directory , species_id ,
10121052 schema_id , loci_prefix ,
10131053 absent_loci )
@@ -1022,8 +1062,11 @@ def main(schema_directory, species_id, schema_name, loci_prefix,
10221062 print ('\n The NS completed the insertion of {0} '
10231063 'loci.\n ' .format (len (absent_loci )))
10241064 else :
1065+ # Create directory to store loci data files
1066+ loci_data_dir = os .path .join (upload_temp_dir , 'loci_data' )
1067+ fo .create_directory (loci_data_dir )
10251068 print (' Collecting loci data...' )
1026- loci_file = create_loci_file (dna_files , loci_annotations ,
1069+ loci_file = create_loci_file (dna_files , loci_data_dir , loci_annotations ,
10271070 schema_directory , species_id ,
10281071 schema_id , loci_prefix )
10291072 print (' Sending data to chewie-NS...' )
@@ -1035,18 +1078,20 @@ def main(schema_directory, species_id, schema_name, loci_prefix,
10351078 print ('\n The NS completed the insertion of {0} '
10361079 'loci.\n ' .format (len (response_data )))
10371080
1038- # create files with info for posting alleles
1081+ # Create files with info for posting alleles
1082+ allele_data_dir = os .path .join (upload_temp_dir , 'allele_data' )
1083+ fo .create_directory (allele_data_dir )
10391084 print ('Alleles data:' )
10401085 print (' Collecting alleles data...' )
10411086 (alleles_files , loci_ids , loci_hashes ,
1042- loci_names ) = create_alleles_files (dna_files , response_data ,
1087+ loci_names ) = create_alleles_files (dna_files , allele_data_dir , response_data ,
10431088 invalid_identifiers , species_name ,
10441089 nomenclature_server , species_id ,
10451090 schema_id , user_id )
1046- # determine length of all alleles per locus
1047- length_files = create_lengths_files (dna_files , schema_directory )
1091+ # Determine length of all alleles per locus
1092+ length_files = create_lengths_files (dna_files , allele_data_dir )
10481093
1049- # zip all files to reduce upload size
1094+ # Compress all files to reduce upload size
10501095 print (' Compressing files with alleles data...' )
10511096 zipped_files = ['{0}.zip' .format (file ) for file in alleles_files ]
10521097 list (map (fo .file_zipper , alleles_files , zipped_files ))
@@ -1065,7 +1110,7 @@ def main(schema_directory, species_id, schema_name, loci_prefix,
10651110 'contact the NS Admin if the problem '
10661111 'persists.' .format (',' .join (failed )))
10671112 else :
1068- # send training file to NS
1113+ # Upload training file to Chewie- NS
10691114 print ('\n \n Uploading Prodigal training file...' )
10701115 ptf_url = cr .make_url (nomenclature_server , 'species' , species_id ,
10711116 'schemas' , schema_id , 'ptf' )
@@ -1080,13 +1125,6 @@ def main(schema_directory, species_id, schema_name, loci_prefix,
10801125 print ('Schema information will also be available on the '
10811126 'chewie-NS website.\n ' )
10821127
1083- # delete all intermediate files
1128+ # Delete all intermediate files
10841129 print ('Removing intermediate files...' )
1085- fo .remove_files (prot_files )
1086- fo .remove_files (length_files )
1087- fo .remove_files (alleles_files )
1088- fo .remove_files (zipped_files )
1089-
1090- if len (absent_loci ) > 0 :
1091- os .remove (loci_file )
1092- os .remove ('{0}.zip' .format (loci_file ))
1130+ fo .delete_directory (upload_temp_dir )
0 commit comments