Commit 5f33ddb

🐛 refactored preprocessing for python package
1 parent bc10ea1 commit 5f33ddb

File tree: 1 file changed, +111 -138 lines changed


src/instanexus/preprocessing.py

Lines changed: 111 additions & 138 deletions
@@ -14,7 +14,7 @@
 __authors__ = Marco Reverenna
 __copyright__ = Copyright 2025-2026
 __research-group__ = DTU Biosustain (Multi-omics Network Analytics) and DTU Bioengineering
-__date__ = 29 Oct 2025
+__date__ = 13 Nov 2025
 __maintainer__ = Marco Reverenna
 __email__ = [email protected]
 __status__ = Dev
@@ -34,10 +34,14 @@
 from Bio import SeqIO
 
 
-PROJECT_ROOT = Path(__file__).resolve().parents[2]
-JSON_DIR = PROJECT_ROOT / "json"
+#PROJECT_ROOT = Path(__file__).resolve().parents[2]
+#JSON_DIR = PROJECT_ROOT / "json"
 
-def get_sample_metadata(run, chain="", json_path=JSON_DIR / "sample_metadata.json"):
+def get_sample_metadata(run, chain="", json_path=None):
+    """Retrieve sample metadata from a JSON file based on the run and optional chain."""
+    if json_path is None:
+        raise ValueError("json_path must be provided.")
+
     with open(json_path, "r") as f:
         all_meta = json.load(f)

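The helper no longer falls back to a repo-relative JSON_DIR, so callers must supply the metadata path themselves. A minimal sketch of the new call, assuming the package is importable as instanexus; the run name and path below are illustrative:

    from instanexus.preprocessing import get_sample_metadata

    # json_path is now required; leaving it as None raises ValueError
    meta = get_sample_metadata(
        "bsa",                                  # run name, i.e. the input CSV stem
        json_path="json/sample_metadata.json",  # illustrative location
    )
    proteases = meta["proteases"]
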
@@ -57,29 +61,6 @@ def get_sample_metadata(run, chain="", json_path=JSON_DIR / "sample_metadata.jso
     raise ValueError(f"No metadata found for run '{run}' with chain '{chain}'.")
 
 
-# Define and create the necessary directories only if they don't exist
-def create_directory(path):
-    """Creates a directory if it does not already exist.
-    Args:
-        path (str): The path of the directory to create.
-    """
-    if not os.path.exists(path):
-        os.makedirs(path)
-        # print(f"Created: {path}")
-    # else:
-    #     print(f"Already exists: {path}")
-
-
-def create_subdirectories_outputs(folder):
-    """Creates subdirectories within the specified folder.
-    Args:
-        folder (str): The path of the parent directory.
-    """
-    subdirectories = ["cleaned", "contigs", "scaffolds", "statistics"]
-    for subdirectory in subdirectories:
-        create_directory(f"{folder}/{subdirectory}")
-
-
 def normalize_sequence(sequence):
     """Normalize the given amino acid sequence by replacing all occurrences of 'I' with
     'L'.
@@ -130,24 +111,6 @@ def remove_modifications(psm_column):
     return None
 
 
-# ! needs to move once it is a package
-def test_remove_modifications():
-    assert remove_modifications("A(ox)BC(mod)D") == "ABCD"
-    assert remove_modifications("A[UNIMOD:21]BC[UNIMOD:35]D") == "ABCD"
-    assert (
-        remove_modifications("A(ox)[UNIMOD:21]BC(mod)[UNIMOD:35]D") == "ABCD"
-    )
-    assert remove_modifications(None) is None
-    assert remove_modifications("ACD") == "ACD"
-    assert remove_modifications("A(I)BCD") == "ABCD"
-    assert remove_modifications("A(ox)B(I)C(mod)D") == "ABCD"
-    assert (
-        remove_modifications("A(ox)[UNIMOD:21]B(I)C(mod)[UNIMOD:35]D") == "ABCD"
-    )
-    assert remove_modifications("AI BCD") == "AL BCD"
-    assert remove_modifications("A(ox)I B(mod)CD") == "AL BCD"
-
-
 def clean_dataframe(df):
     """Clean and preprocess a DataFrame for analysis by removing '(ox)' substrings
     from sequences in the 'seq' column. By replacing values of -1 with -10 in the
@@ -214,64 +177,65 @@ def filter_contaminants(seqs, run, contaminants_fasta):
 
 def main(
     input_csv: str,
-    chain: str = "",
-    folder_outputs: str = "outputs",
-    reference: bool = False,
-    assembly_mode = "dbg",
-    conf: float = 0.88,
-    kmer_size: int = 6,
-    size_threshold: int = 10,
-    min_overlap: int = 4,
-    min_identity: float = 0.8,
-    max_mismatches: int = 14,
+    metadata_json: str,
+    contaminants_fasta: str,
+    chain: str,
+    #folder_outputs: str,
+    reference: bool,
+    #assembly_mode: str,
+    conf: float,
+    output_csv_path: str,
+    #kmer_size: int,
+    #size_threshold: int,
+    #min_overlap: int,
+    #min_identity: float,
+    #max_mismatches: int,
 ):
     """Main function to run the preprocessing script."""
     input_csv = Path(input_csv)
 
     print("Starting preprocessing pipeline.")
 
-    run = input_csv.stem
-    repo_folder = Path(__file__).resolve().parents[2]
+    input_csv = Path(input_csv)
+    run = input_csv.stem  # stem gives the filename without suffix
 
     # load metadata
     if chain:
-        meta = get_sample_metadata(run, chain=chain)
+        meta = get_sample_metadata(run, chain=chain, json_path=metadata_json)
     else:
-        meta = get_sample_metadata(run)
+        meta = get_sample_metadata(run, json_path=metadata_json)
 
     proteases = meta["proteases"]
 
     if reference:
         protein = meta["protein"]
 
-    if assembly_mode != "dbg":
-        print("Ignoring kmer_size (only relevant for dbg mode)")
-        kmer_size = None
+    # if assembly_mode != "dbg":
+    #     print("Ignoring kmer_size (only relevant for dbg mode)")
+    #     kmer_size = None
 
-    if not reference:
-        print("Ignoring min_identity and max_mismatches (only relevant when reference=True)")
-        min_identity = None
-        max_mismatches = None
+    # if not reference:
+    #     print("Ignoring min_identity and max_mismatches (only relevant when reference=True)")
+    #     min_identity = None
+    #     max_mismatches = None
 
     print("Parameters loaded.")
 
-    folder_outputs = Path(folder_outputs) / run
-    folder_outputs.mkdir(parents=True, exist_ok=True)
-
-    folder_name_parts = [f"comb_{assembly_mode}", f"c{conf}", f"ts{size_threshold}", f"mo{min_overlap}"]
+    #folder_outputs = Path(folder_outputs) / run
+    #folder_outputs.mkdir(parents=True, exist_ok=True)
 
-    if assembly_mode == "dbg":
-        folder_name_parts.insert(2, f"ks{kmer_size}")
+    #folder_name_parts = [f"comb_{assembly_mode}", f"c{conf}", f"ts{size_threshold}", f"mo{min_overlap}"]
 
-    if reference:
-        folder_name_parts.extend([f"mi{min_identity}", f"mm{max_mismatches}"])
+    # if assembly_mode == "dbg":
+    #     folder_name_parts.insert(2, f"ks{kmer_size}")
 
-    combination_folder_out = folder_outputs / "_".join(folder_name_parts)
-    create_subdirectories_outputs(combination_folder_out)
+    #if reference:
+    #    folder_name_parts.extend([f"mi{min_identity}", f"mm{max_mismatches}"])
 
-    print(f"Output folders created at: {combination_folder_out}")
+    # combination_folder_out = folder_outputs / "_".join(folder_name_parts)
+    # create_subdirectories_outputs(combination_folder_out)
 
-    # data cleaning
+    # print(f"Output folders created at: {combination_folder_out}")
 
     logger.info("Starting data cleaning...")

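With the assembly and output-folder bookkeeping commented out, main() reduces to a cleaning step driven entirely by explicit paths. A sketch of an equivalent programmatic call, mirroring the example command at the end of this diff; all paths are illustrative:

    from instanexus.preprocessing import main

    main(
        input_csv="inputs/bsa.csv",
        metadata_json="json/sample_metadata.json",
        contaminants_fasta="fasta/contaminants.fasta",
        chain="",        # optional chain identifier
        reference=False,
        conf=0.9,        # pass None to skip the confidence filter
        output_csv_path="outputs/bsa_cleaned.csv",
    )
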
@@ -290,7 +254,7 @@ def main(
     cleaned_psms = df["cleaned_preds"].tolist()
 
     filtered_psms = filter_contaminants(
-        cleaned_psms, run, repo_folder / "fasta/contaminants.fasta"
+        cleaned_psms, run, contaminants_fasta
     )
     df = df[df["cleaned_preds"].isin(filtered_psms)]

@@ -300,16 +264,21 @@ def main(
     )
 
     # probably confidence trhreshold won't be necessary anymore
-    df = df[df["conf"] > conf]
+    if conf is not None:
+        logger.info(f"Applying confidence threshold: {conf}")
+        df = df[df["conf"] > conf]
+    else:
+        logger.info("No confidence threshold applied.")
 
     df.reset_index(drop=True, inplace=True)
 
     logger.info("Data cleaning completed.")
+    cleaned_csv_path = Path(output_csv_path)
+    cleaned_csv_path.parent.mkdir(parents=True, exist_ok=True)
 
-    cleaned_csv_path = combination_folder_out / "cleaned" / "cleaned_data.csv"
+    #cleaned_csv_path = combination_folder_out / "cleaned" / "cleaned_data.csv"
 
     df.to_csv(cleaned_csv_path, index=False)
-
     logger.info("Cleaned data saved to: {}.".format(cleaned_csv_path))
 

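Because conf now defaults to None, the old implicit 0.88 cutoff is no longer applied; filtering only happens when a threshold is passed. A toy illustration of the new branch, with made-up values:

    import pandas as pd

    df = pd.DataFrame({"cleaned_preds": ["PEPTLDE", "SEQLENCE"], "conf": [0.95, 0.42]})

    conf = None                # new default: keep every row
    if conf is not None:
        df = df[df["conf"] > conf]
    assert len(df) == 2

    df = df[df["conf"] > 0.9]  # the old cutoff must now be requested explicitly
    assert len(df) == 1
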
@@ -332,80 +301,84 @@ def cli():
         default="",
         help="Chain identifier for the sample (optional).",
     )
-    parser.add_argument(
-        "--folder-outputs",
-        type=str,
-        default="outputs",
-        help="Folder to save output files.",
-    )
+    # parser.add_argument(
+    #     "--folder-outputs",
+    #     type=str,
+    #     default="outputs",
+    #     help="Folder to save output files.",
+    # )
     parser.add_argument(
         "--reference",
         action="store_true",
         help="Whether to use reference protein sequence for mapping.",
     )
-
-    parser.add_argument(
-        "--assembly-mode",
-        type=str,
-        choices=["dbg", "greedy"],
-        help="Assembly algorithm to use.",
-    )
-    parser.add_argument(
-        "--kmer-size",
-        type=int,
-        default=6,
-        help="K-mer size (only used if --assembly-mode dbg).",
-    )
-    parser.add_argument(
-        "--min-identity",
-        type=float,
-        default=0.8,
-        help="Minimum identity threshold (only used if --reference).",
-    )
-    parser.add_argument(
-        "--max-mismatches",
-        type=int,
-        default=14,
-        help="Maximum allowed mismatches (only used if --reference).",
-    )
-
+    # parser.add_argument(
+    #     "--assembly-mode",
+    #     type=str,
+    #     choices=["dbg", "greedy"],
+    #     required=True,
+    #     help="Assembly algorithm to use.",
+    # )
+    # parser.add_argument(
+    #     "--kmer-size",
+    #     type=int,
+    #     default=7,
+    #     help="K-mer size (only used if --assembly-mode dbg).",
+    # )
+    # parser.add_argument(
+    #     "--min-identity",
+    #     type=float,
+    #     default=0.8,
+    #     help="Minimum identity threshold (only used if --reference).",
+    # )
+    # parser.add_argument(
+    #     "--max-mismatches",
+    #     type=int,
+    #     default=14,
+    #     help="Maximum allowed mismatches (only used if --reference).",
+    # )
     parser.add_argument(
         "--conf",
         type=float,
-        default=0.88,
+        default=None,
         help="Confidence threshold for filtering (default: 0.88).",
     )
+    # parser.add_argument(
+    #     "--size-threshold",
+    #     type=int,
+    #     default=10,
+    #     help="Minimum contig size threshold (default: 10).",
+    # )
+    # parser.add_argument(
+    #     "--min-overlap",
+    #     type=int,
+    #     default=3,
+    #     help="Minimum overlap size between reads (default: 3).",
+    # )
     parser.add_argument(
-        "--size-threshold",
-        type=int,
-        default=10,
-        help="Minimum contig size threshold (default: 10).",
+        "--metadata-json",
+        type=str,
+        required=True,
+        help="Path to the sample_metadata.json file.",
+    )
+    parser.add_argument(
+        "--contaminants-fasta",
+        type=str,
+        required=True,
+        help="Path to the contaminants.fasta file.",
     )
     parser.add_argument(
-        "--min-overlap",
-        type=int,
-        default=4,
-        help="Minimum overlap size between reads (default: 4).",
+        "--output-csv-path",
+        type=str,
+        required=True,
+        help="Path to the output CSV file."
     )
 
     args = parser.parse_args()
 
-    main(
-        input_csv=args.input_csv,
-        chain=args.chain,
-        folder_outputs=args.folder_outputs,
-        reference=args.reference,
-        assembly_mode=args.assembly_mode,
-        conf=args.conf,
-        kmer_size=args.kmer_size,
-        size_threshold=args.size_threshold,
-        min_overlap=args.min_overlap,
-        min_identity=args.min_identity,
-        max_mismatches=args.max_mismatches
-    )
-
+    main(**vars(args))
 
 if __name__ == "__main__":
     cli()
 
-# python -m instanexus.preprocessing --input-csv ../../inputs/bsa.csv --folder-outputs ../../outputs --assembly-mode dbg --conf 0.9 --kmer-size 7 --size-threshold 12 --min-overlap 5
+# python -m instanexus.preprocessing --input-csv inputs/bsa.csv --metadata-json json/sample_metadata.json --contaminants-fasta fasta/contaminants.fasta --output-csv-path outputs/bsa_cleaned.csv --conf 0.9 --reference

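The new main(**vars(args)) call works because argparse turns each --flag-name into a flag_name attribute, and those attribute names now line up one-to-one with main()'s parameters. A small sketch of that mapping, with hypothetical values:

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--input-csv", required=True)
    parser.add_argument("--output-csv-path", required=True)
    args = parser.parse_args(
        ["--input-csv", "inputs/bsa.csv", "--output-csv-path", "outputs/bsa_cleaned.csv"]
    )
    # dashes become underscores, so vars(args) keys match main()'s signature
    print(vars(args))  # {'input_csv': 'inputs/bsa.csv', 'output_csv_path': 'outputs/bsa_cleaned.csv'}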