 __authors__ = Marco Reverenna
 __copyright__ = Copyright 2025-2026
 __research-group__ = DTU Biosustain (Multi-omics Network Analytics) and DTU Bioengineering
-__date__ = 29 Oct 2025
+__date__ = 13 Nov 2025
 __maintainer__ = Marco Reverenna

 __status__ = Dev
 from Bio import SeqIO


-PROJECT_ROOT = Path(__file__).resolve().parents[2]
-JSON_DIR = PROJECT_ROOT / "json"
+# PROJECT_ROOT = Path(__file__).resolve().parents[2]
+# JSON_DIR = PROJECT_ROOT / "json"

-def get_sample_metadata(run, chain="", json_path=JSON_DIR / "sample_metadata.json"):
+def get_sample_metadata(run, chain="", json_path=None):
+    """Retrieve sample metadata from a JSON file based on the run and optional chain."""
+    if json_path is None:
+        raise ValueError("json_path must be provided.")
+
     with open(json_path, "r") as f:
         all_meta = json.load(f)
@@ -57,29 +61,6 @@ def get_sample_metadata(run, chain="", json_path=JSON_DIR / "sample_metadata.jso
     raise ValueError(f"No metadata found for run '{run}' with chain '{chain}'.")


-# Define and create the necessary directories only if they don't exist
-def create_directory(path):
-    """Creates a directory if it does not already exist.
-    Args:
-        path (str): The path of the directory to create.
-    """
-    if not os.path.exists(path):
-        os.makedirs(path)
-        # print(f"Created: {path}")
-    # else:
-    #     print(f"Already exists: {path}")
-
-
-def create_subdirectories_outputs(folder):
-    """Creates subdirectories within the specified folder.
-    Args:
-        folder (str): The path of the parent directory.
-    """
-    subdirectories = ["cleaned", "contigs", "scaffolds", "statistics"]
-    for subdirectory in subdirectories:
-        create_directory(f"{folder}/{subdirectory}")
-
-
 def normalize_sequence(sequence):
     """Normalize the given amino acid sequence by replacing all occurrences of 'I' with
     'L'.
@@ -130,24 +111,6 @@ def remove_modifications(psm_column):
     return None


-# ! needs to move once it is a package
-def test_remove_modifications():
-    assert remove_modifications("A(ox)BC(mod)D") == "ABCD"
-    assert remove_modifications("A[UNIMOD:21]BC[UNIMOD:35]D") == "ABCD"
-    assert (
-        remove_modifications("A(ox)[UNIMOD:21]BC(mod)[UNIMOD:35]D") == "ABCD"
-    )
-    assert remove_modifications(None) is None
-    assert remove_modifications("ACD") == "ACD"
-    assert remove_modifications("A(I)BCD") == "ABCD"
-    assert remove_modifications("A(ox)B(I)C(mod)D") == "ABCD"
-    assert (
-        remove_modifications("A(ox)[UNIMOD:21]B(I)C(mod)[UNIMOD:35]D") == "ABCD"
-    )
-    assert remove_modifications("AI BCD") == "AL BCD"
-    assert remove_modifications("A(ox)I B(mod)CD") == "AL BCD"
-
-
 def clean_dataframe(df):
     """Clean and preprocess a DataFrame for analysis by removing '(ox)' substrings
     from sequences in the 'seq' column. By replacing values of -1 with -10 in the
@@ -214,64 +177,65 @@ def filter_contaminants(seqs, run, contaminants_fasta):

 def main(
     input_csv: str,
-    chain: str = "",
-    folder_outputs: str = "outputs",
-    reference: bool = False,
-    assembly_mode="dbg",
-    conf: float = 0.88,
-    kmer_size: int = 6,
-    size_threshold: int = 10,
-    min_overlap: int = 4,
-    min_identity: float = 0.8,
-    max_mismatches: int = 14,
+    metadata_json: str,
+    contaminants_fasta: str,
+    chain: str,
+    # folder_outputs: str,
+    reference: bool,
+    # assembly_mode: str,
+    conf: float,
+    output_csv_path: str,
+    # kmer_size: int,
+    # size_threshold: int,
+    # min_overlap: int,
+    # min_identity: float,
+    # max_mismatches: int,
 ):
228194 """Main function to run the preprocessing script."""
229195 input_csv = Path (input_csv )
230196
231197 print ("Starting preprocessing pipeline." )
232198
233- run = input_csv . stem
234- repo_folder = Path ( __file__ ). resolve (). parents [ 2 ]
199+ input_csv = Path ( input_csv )
200+ run = input_csv . stem # stem gives the filename without suffix
235201
236202 # load metadata
237203 if chain :
238- meta = get_sample_metadata (run , chain = chain )
204+ meta = get_sample_metadata (run , chain = chain , json_path = metadata_json )
239205 else :
240- meta = get_sample_metadata (run )
206+ meta = get_sample_metadata (run , json_path = metadata_json )
241207
242208 proteases = meta ["proteases" ]
243209
244210 if reference :
245211 protein = meta ["protein" ]
246212
-    if assembly_mode != "dbg":
-        print("Ignoring kmer_size (only relevant for dbg mode)")
-        kmer_size = None
+    # if assembly_mode != "dbg":
+    #     print("Ignoring kmer_size (only relevant for dbg mode)")
+    #     kmer_size = None

-    if not reference:
-        print("Ignoring min_identity and max_mismatches (only relevant when reference=True)")
-        min_identity = None
-        max_mismatches = None
+    # if not reference:
+    #     print("Ignoring min_identity and max_mismatches (only relevant when reference=True)")
+    #     min_identity = None
+    #     max_mismatches = None

     print("Parameters loaded.")

-    folder_outputs = Path(folder_outputs) / run
-    folder_outputs.mkdir(parents=True, exist_ok=True)
-
-    folder_name_parts = [f"comb_{assembly_mode}", f"c{conf}", f"ts{size_threshold}", f"mo{min_overlap}"]
+    # folder_outputs = Path(folder_outputs) / run
+    # folder_outputs.mkdir(parents=True, exist_ok=True)

-    if assembly_mode == "dbg":
-        folder_name_parts.insert(2, f"ks{kmer_size}")
+    # folder_name_parts = [f"comb_{assembly_mode}", f"c{conf}", f"ts{size_threshold}", f"mo{min_overlap}"]

-    if reference:
-        folder_name_parts.extend([f"mi{min_identity}", f"mm{max_mismatches}"])
+    # if assembly_mode == "dbg":
+    #     folder_name_parts.insert(2, f"ks{kmer_size}")

-    combination_folder_out = folder_outputs / "_".join(folder_name_parts)
-    create_subdirectories_outputs(combination_folder_out)
+    # if reference:
+    #     folder_name_parts.extend([f"mi{min_identity}", f"mm{max_mismatches}"])

-    print(f"Output folders created at: {combination_folder_out}")
+    # combination_folder_out = folder_outputs / "_".join(folder_name_parts)
+    # create_subdirectories_outputs(combination_folder_out)

-    # data cleaning
+    # print(f"Output folders created at: {combination_folder_out}")

     logger.info("Starting data cleaning...")

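
Since `main()` no longer derives `repo_folder` from `__file__`, the pipeline can be driven entirely by explicit paths. A sketch of a direct call under the new signature (all paths illustrative):

```python
from instanexus.preprocessing import main

main(
    input_csv="inputs/bsa.csv",
    metadata_json="json/sample_metadata.json",
    contaminants_fasta="fasta/contaminants.fasta",
    chain="",                  # no chain identifier for this run
    reference=False,           # skip reference-protein mapping
    conf=0.9,                  # or None to skip confidence filtering
    output_csv_path="outputs/bsa_cleaned.csv",
)
```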
@@ -290,7 +254,7 @@ def main(
     cleaned_psms = df["cleaned_preds"].tolist()

     filtered_psms = filter_contaminants(
-        cleaned_psms, run, repo_folder / "fasta/contaminants.fasta"
+        cleaned_psms, run, contaminants_fasta
     )
     df = df[df["cleaned_preds"].isin(filtered_psms)]

@@ -300,16 +264,21 @@ def main(
     )

     # probably the confidence threshold won't be necessary anymore
-    df = df[df["conf"] > conf]
+    if conf is not None:
+        logger.info(f"Applying confidence threshold: {conf}")
+        df = df[df["conf"] > conf]
+    else:
+        logger.info("No confidence threshold applied.")

     df.reset_index(drop=True, inplace=True)

     logger.info("Data cleaning completed.")
+    cleaned_csv_path = Path(output_csv_path)
+    cleaned_csv_path.parent.mkdir(parents=True, exist_ok=True)

-    cleaned_csv_path = combination_folder_out / "cleaned" / "cleaned_data.csv"
+    # cleaned_csv_path = combination_folder_out / "cleaned" / "cleaned_data.csv"

     df.to_csv(cleaned_csv_path, index=False)
-
     logger.info("Cleaned data saved to: {}.".format(cleaned_csv_path))


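
The new `cleaned_csv_path.parent.mkdir(parents=True, exist_ok=True)` call is what makes the removed `create_directory`/`create_subdirectories_outputs` helpers redundant: `pathlib` already creates the whole directory chain idempotently. A minimal sketch with a hypothetical nested output path:

```python
from pathlib import Path

out = Path("outputs/bsa/cleaned/cleaned_data.csv")  # hypothetical nested path
out.parent.mkdir(parents=True, exist_ok=True)       # creates outputs/bsa/cleaned if missing
out.write_text("seq,conf\n")                        # stand-in for df.to_csv(out, index=False)
```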
@@ -332,80 +301,84 @@ def cli():
         default="",
         help="Chain identifier for the sample (optional).",
     )
-    parser.add_argument(
-        "--folder-outputs",
-        type=str,
-        default="outputs",
-        help="Folder to save output files.",
-    )
+    # parser.add_argument(
+    #     "--folder-outputs",
+    #     type=str,
+    #     default="outputs",
+    #     help="Folder to save output files.",
+    # )
     parser.add_argument(
         "--reference",
         action="store_true",
         help="Whether to use reference protein sequence for mapping.",
     )
-
-    parser.add_argument(
-        "--assembly-mode",
-        type=str,
-        choices=["dbg", "greedy"],
-        help="Assembly algorithm to use.",
-    )
-    parser.add_argument(
-        "--kmer-size",
-        type=int,
-        default=6,
-        help="K-mer size (only used if --assembly-mode dbg).",
-    )
-    parser.add_argument(
-        "--min-identity",
-        type=float,
-        default=0.8,
-        help="Minimum identity threshold (only used if --reference).",
-    )
-    parser.add_argument(
-        "--max-mismatches",
-        type=int,
-        default=14,
-        help="Maximum allowed mismatches (only used if --reference).",
-    )
-
+    # parser.add_argument(
+    #     "--assembly-mode",
+    #     type=str,
+    #     choices=["dbg", "greedy"],
+    #     required=True,
+    #     help="Assembly algorithm to use.",
+    # )
+    # parser.add_argument(
+    #     "--kmer-size",
+    #     type=int,
+    #     default=7,
+    #     help="K-mer size (only used if --assembly-mode dbg).",
+    # )
+    # parser.add_argument(
+    #     "--min-identity",
+    #     type=float,
+    #     default=0.8,
+    #     help="Minimum identity threshold (only used if --reference).",
+    # )
+    # parser.add_argument(
+    #     "--max-mismatches",
+    #     type=int,
+    #     default=14,
+    #     help="Maximum allowed mismatches (only used if --reference).",
+    # )
     parser.add_argument(
         "--conf",
         type=float,
-        default=0.88,
-        help="Confidence threshold for filtering (default: 0.88).",
+        default=None,
+        help="Confidence threshold for filtering (no filtering if omitted).",
     )
+    # parser.add_argument(
+    #     "--size-threshold",
+    #     type=int,
+    #     default=10,
+    #     help="Minimum contig size threshold (default: 10).",
+    # )
+    # parser.add_argument(
+    #     "--min-overlap",
+    #     type=int,
+    #     default=3,
+    #     help="Minimum overlap size between reads (default: 3).",
+    # )
     parser.add_argument(
-        "--size-threshold",
-        type=int,
-        default=10,
-        help="Minimum contig size threshold (default: 10).",
+        "--metadata-json",
+        type=str,
+        required=True,
+        help="Path to the sample_metadata.json file.",
+    )
+    parser.add_argument(
+        "--contaminants-fasta",
+        type=str,
+        required=True,
+        help="Path to the contaminants.fasta file.",
     )
     parser.add_argument(
-        "--min-overlap",
-        type=int,
-        default=4,
-        help="Minimum overlap size between reads (default: 4).",
+        "--output-csv-path",
+        type=str,
+        required=True,
+        help="Path to the output CSV file.",
     )

     args = parser.parse_args()

-    main(
-        input_csv=args.input_csv,
-        chain=args.chain,
-        folder_outputs=args.folder_outputs,
-        reference=args.reference,
-        assembly_mode=args.assembly_mode,
-        conf=args.conf,
-        kmer_size=args.kmer_size,
-        size_threshold=args.size_threshold,
-        min_overlap=args.min_overlap,
-        min_identity=args.min_identity,
-        max_mismatches=args.max_mismatches
-    )
-
+    main(**vars(args))

 if __name__ == "__main__":
     cli()

-# python -m instanexus.preprocessing --input-csv ../../inputs/bsa.csv --folder-outputs ../../outputs --assembly-mode dbg --conf 0.9 --kmer-size 7 --size-threshold 12 --min-overlap 5
+# python -m instanexus.preprocessing --input-csv inputs/bsa.csv --metadata-json json/sample_metadata.json --contaminants-fasta fasta/contaminants.fasta --output-csv-path outputs/bsa_cleaned.csv --conf 0.9 --reference
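
The `main(**vars(args))` call works because `argparse` derives each destination name by swapping dashes for underscores, so the `Namespace` keys line up one-to-one with `main()`'s keyword parameters (this only holds while every parser option has a matching parameter). A self-contained illustration:

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--output-csv-path", type=str, required=True)
args = parser.parse_args(["--output-csv-path", "out.csv"])

# Dashes become underscores in the Namespace:
print(vars(args))  # {'output_csv_path': 'out.csv'}
```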