33r"""Alignment module for clustered scaffolds.
44
55 ██████████ ███████████ █████ █████
6- ░░███░░░░███ ░█░░░███░░░█░░███ ░░███
7- ░███ ░░███░ ░███ ░ ░███ ░███
8- ░███ ░███ ░███ ░███ ░███
9- ░███ ░███ ░███ ░███ ░███
10- ░███ ███ ░███ ░███ ░███
11- ██████████ █████ ░░████████
12- ░░░░░░░░░░ ░░░░░ ░░░░░░░░
13-
6+ ░░███░░░░███ ░█░░░███░░░█░░███ ░░███
7+ ░███ ░░███░ ░███ ░ ░███ ░███
8+ ░███ ░███ ░███ ░███ ░███
9+ ░███ ░███ ░███ ░███ ░███
10+ ░███ ███ ░███ ░███ ░███
11+ ██████████ █████ ░░████████
12+ ░░░░░░░░░░ ░░░░░ ░░░░░░░░
13+
1414__authors__ = Marco Reverenna
1515__copyright__ = Copyright 2025-2026
1616__research-group__ = DTU Biosustain (Multi-omics Network Analytics) and DTU Bioengineering
2828from pathlib import Path
2929from Bio import SeqIO
3030
31- logging .basicConfig (level = logging .INFO , format = "%(asctime)s [%(levelname)s] %(message)s" )
31+ logging .basicConfig (
32+ level = logging .INFO , format = "%(asctime)s [%(levelname)s] %(message)s"
33+ )
3234logger = logging .getLogger (__name__ )
3335
3436
@@ -49,16 +51,29 @@ def align_or_copy_fasta(fasta_file, output_file):
4951 logger .debug (f"Copied single-sequence file: { Path (fasta_file ).name } " )
5052 elif len (sequences ) > 1 :
5153 # Multiple sequences, run clustalo
52- logger .debug (f"Aligning { len (sequences )} sequences from { Path (fasta_file ).name } ..." )
54+ logger .debug (
55+ f"Aligning { len (sequences )} sequences from { Path (fasta_file ).name } ..."
56+ )
5357 try :
5458 subprocess .run (
55- ["clustalo" , "-i" , fasta_file , "-o" , output_file , "--outfmt" , "fa" , "--force" ],
59+ [
60+ "clustalo" ,
61+ "-i" ,
62+ fasta_file ,
63+ "-o" ,
64+ output_file ,
65+ "--outfmt" ,
66+ "fa" ,
67+ "--force" ,
68+ ],
5669 check = True ,
57- capture_output = True , # Suppress clustalo stdout
58- text = True
70+ capture_output = True , # Suppress clustalo stdout
71+ text = True ,
5972 )
6073 except FileNotFoundError :
61- logger .error ("clustalo command not found. Please ensure it is in your system's PATH." )
74+ logger .error (
75+ "clustalo command not found. Please ensure it is in your system's PATH."
76+ )
6277 raise
6378 except subprocess .CalledProcessError as e :
6479 logger .error (f"Clustalo failed for { fasta_file } : { e .stderr } " )
@@ -71,39 +86,39 @@ def align_or_copy_fasta(fasta_file, output_file):
def process_alignment(input_dir: str, output_dir: str):
    """
    Align all FASTA files from input_dir and save results in output_dir.

    Every entry of input_dir whose name ends with ".fasta" is passed, in
    sorted name order, to align_or_copy_fasta. The result for
    "<name>.fasta" is written to "<name>_aligned.afa" inside output_dir.

    Args:
        input_dir (str): Path to the .../cluster_fasta/ directory.
        output_dir (str): Path to the .../alignment/ directory. It is
            created (including missing parents) when it does not exist.

    Raises:
        FileNotFoundError: If input_dir does not exist.

    Exceptions raised by align_or_copy_fasta are not caught here and
    propagate to the caller.
    """
    fasta_suffix = ".fasta"
    cluster_fasta_folder = Path(input_dir)
    alignment_folder = Path(output_dir)

    # Validate the input before touching the filesystem, so a wrong input
    # path does not leave an empty output folder behind.
    if not cluster_fasta_folder.exists():
        message = f"Cluster FASTA folder not found: {cluster_fasta_folder}"
        logger.error(message)
        raise FileNotFoundError(message)

    alignment_folder.mkdir(parents=True, exist_ok=True)

    # Sorted so the files are processed in a deterministic order.
    fasta_files_to_align = [
        name
        for name in sorted(os.listdir(cluster_fasta_folder))
        if name.endswith(fasta_suffix)
    ]

    logger.info(f"Found {len(fasta_files_to_align)} cluster FASTA files to align.")

    for fasta_file in fasta_files_to_align:
        fasta_path = cluster_fasta_folder / fasta_file
        # Save output as .afa (alignment FASTA). Only the trailing extension
        # is swapped: str.replace would also rewrite any ".fasta" that
        # happens to appear earlier in the file name.
        output_name = fasta_file[: -len(fasta_suffix)] + "_aligned.afa"
        output_path = alignment_folder / output_name

        align_or_copy_fasta(fasta_path, output_path)

    logger.info("All alignment tasks completed.")
103119
104120
105- def main (input_cluster_fasta_folder : str ,
106- output_alignment_folder : str ):
121+ def main (input_cluster_fasta_folder : str , output_alignment_folder : str ):
107122 """
108123 Main function to run the alignment script.
109124 """
@@ -113,10 +128,9 @@ def main(input_cluster_fasta_folder: str,
113128 logger .info (f"Output Folder (Alignments): { output_alignment_folder } " )
114129
115130 process_alignment (
116- input_dir = input_cluster_fasta_folder ,
117- output_dir = output_alignment_folder
131+ input_dir = input_cluster_fasta_folder , output_dir = output_alignment_folder
118132 )
119-
133+
120134 logger .info ("--- Step 4: Alignment Completed ---" )
121135
122136
@@ -127,30 +141,31 @@ def cli():
127141 parser = argparse .ArgumentParser (
128142 description = "Alignment script for clustered scaffolds."
129143 )
130-
144+
131145 parser .add_argument (
132146 "--input-folder" ,
133147 type = str ,
134148 required = True ,
135- help = "Path to the folder containing cluster FASTA files (e.g., .../cluster_fasta)."
149+ help = "Path to the folder containing cluster FASTA files (e.g., .../cluster_fasta)." ,
136150 )
137151 parser .add_argument (
138152 "--output-folder" ,
139153 type = str ,
140154 required = True ,
141- help = "Path to the folder to save aligned .afa files (e.g., .../alignment)."
155+ help = "Path to the folder to save aligned .afa files (e.g., .../alignment)." ,
142156 )
143-
157+
144158 args = parser .parse_args ()
145159
146- main (input_cluster_fasta_folder = args .input_folder ,
147- output_alignment_folder = args .output_folder
148- )
160+ main (
161+ input_cluster_fasta_folder = args .input_folder ,
162+ output_alignment_folder = args .output_folder ,
163+ )
149164
150165
151166if __name__ == "__main__" :
152167 cli ()
153168
154169# python -m instanexus.alignment \
155170# --input-folder outputs/bsa/scaffolds/clustering/cluster_fasta \
156- # --output-folder outputs/bsa/scaffolds/alignment
171+ # --output-folder outputs/bsa/scaffolds/alignment
0 commit comments