Commit 5f33ddb

🐛 refactored preprocessing for python package
1 parent bc10ea1 commit 5f33ddb

File tree: 1 file changed, +111 -138 lines changed


src/instanexus/preprocessing.py

Lines changed: 111 additions & 138 deletions
@@ -14,7 +14,7 @@
 __authors__ = Marco Reverenna
 __copyright__ = Copyright 2025-2026
 __research-group__ = DTU Biosustain (Multi-omics Network Analytics) and DTU Bioengineering
-__date__ = 29 Oct 2025
+__date__ = 13 Nov 2025
 __maintainer__ = Marco Reverenna
 __email__ = [email protected]
 __status__ = Dev
@@ -34,10 +34,14 @@
 from Bio import SeqIO
 
 
-PROJECT_ROOT = Path(__file__).resolve().parents[2]
-JSON_DIR = PROJECT_ROOT / "json"
+#PROJECT_ROOT = Path(__file__).resolve().parents[2]
+#JSON_DIR = PROJECT_ROOT / "json"
 
-def get_sample_metadata(run, chain="", json_path=JSON_DIR / "sample_metadata.json"):
+def get_sample_metadata(run, chain="", json_path=None):
+    """Retrieve sample metadata from a JSON file based on the run and optional chain."""
+    if json_path is None:
+        raise ValueError("json_path must be provided.")
+
     with open(json_path, "r") as f:
         all_meta = json.load(f)

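The helper no longer falls back to a repo-relative JSON_DIR, so callers must supply the metadata path themselves. A minimal sketch of the new call, assuming the package is importable as instanexus; the run name and path below are illustrative:

    from instanexus.preprocessing import get_sample_metadata

    # json_path is now required; leaving it as None raises ValueError
    meta = get_sample_metadata(
        "bsa",                                  # run name, i.e. the input CSV stem
        json_path="json/sample_metadata.json",  # illustrative location
    )
    proteases = meta["proteases"]
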
@@ -57,29 +61,6 @@ def get_sample_metadata(run, chain="", json_path=JSON_DIR / "sample_metadata.jso
     raise ValueError(f"No metadata found for run '{run}' with chain '{chain}'.")
 
 
-# Define and create the necessary directories only if they don't exist
-def create_directory(path):
-    """Creates a directory if it does not already exist.
-    Args:
-        path (str): The path of the directory to create.
-    """
-    if not os.path.exists(path):
-        os.makedirs(path)
-        # print(f"Created: {path}")
-    # else:
-    #     print(f"Already exists: {path}")
-
-
-def create_subdirectories_outputs(folder):
-    """Creates subdirectories within the specified folder.
-    Args:
-        folder (str): The path of the parent directory.
-    """
-    subdirectories = ["cleaned", "contigs", "scaffolds", "statistics"]
-    for subdirectory in subdirectories:
-        create_directory(f"{folder}/{subdirectory}")
-
-
 def normalize_sequence(sequence):
     """Normalize the given amino acid sequence by replacing all occurrences of 'I' with
     'L'.
@@ -130,24 +111,6 @@ def remove_modifications(psm_column):
     return None
 
 
-# ! needs to move once it is a package
-def test_remove_modifications():
-    assert remove_modifications("A(ox)BC(mod)D") == "ABCD"
-    assert remove_modifications("A[UNIMOD:21]BC[UNIMOD:35]D") == "ABCD"
-    assert (
-        remove_modifications("A(ox)[UNIMOD:21]BC(mod)[UNIMOD:35]D") == "ABCD"
-    )
-    assert remove_modifications(None) is None
-    assert remove_modifications("ACD") == "ACD"
-    assert remove_modifications("A(I)BCD") == "ABCD"
-    assert remove_modifications("A(ox)B(I)C(mod)D") == "ABCD"
-    assert (
-        remove_modifications("A(ox)[UNIMOD:21]B(I)C(mod)[UNIMOD:35]D") == "ABCD"
-    )
-    assert remove_modifications("AI BCD") == "AL BCD"
-    assert remove_modifications("A(ox)I B(mod)CD") == "AL BCD"
-
-
 def clean_dataframe(df):
     """Clean and preprocess a DataFrame for analysis by removing '(ox)' substrings
     from sequences in the 'seq' column. By replacing values of -1 with -10 in the
@@ -214,64 +177,65 @@ def filter_contaminants(seqs, run, contaminants_fasta):
 
 def main(
     input_csv: str,
-    chain: str = "",
-    folder_outputs: str = "outputs",
-    reference: bool = False,
-    assembly_mode = "dbg",
-    conf: float = 0.88,
-    kmer_size: int = 6,
-    size_threshold: int = 10,
-    min_overlap: int = 4,
-    min_identity: float = 0.8,
-    max_mismatches: int = 14,
+    metadata_json: str,
+    contaminants_fasta: str,
+    chain: str,
+    #folder_outputs: str,
+    reference: bool,
+    #assembly_mode: str,
+    conf: float,
+    output_csv_path: str,
+    #kmer_size: int,
+    #size_threshold: int,
+    #min_overlap: int,
+    #min_identity: float,
+    #max_mismatches: int,
 ):
     """Main function to run the preprocessing script."""
     input_csv = Path(input_csv)
 
     print("Starting preprocessing pipeline.")
 
-    run = input_csv.stem
-    repo_folder = Path(__file__).resolve().parents[2]
+    input_csv = Path(input_csv)
+    run = input_csv.stem  # stem gives the filename without suffix
 
     # load metadata
     if chain:
-        meta = get_sample_metadata(run, chain=chain)
+        meta = get_sample_metadata(run, chain=chain, json_path=metadata_json)
     else:
-        meta = get_sample_metadata(run)
+        meta = get_sample_metadata(run, json_path=metadata_json)
 
     proteases = meta["proteases"]
 
     if reference:
         protein = meta["protein"]
 
-    if assembly_mode != "dbg":
-        print("Ignoring kmer_size (only relevant for dbg mode)")
-        kmer_size = None
+    # if assembly_mode != "dbg":
+    #     print("Ignoring kmer_size (only relevant for dbg mode)")
+    #     kmer_size = None
 
-    if not reference:
-        print("Ignoring min_identity and max_mismatches (only relevant when reference=True)")
-        min_identity = None
-        max_mismatches = None
+    # if not reference:
+    #     print("Ignoring min_identity and max_mismatches (only relevant when reference=True)")
+    #     min_identity = None
+    #     max_mismatches = None
 
     print("Parameters loaded.")
 
-    folder_outputs = Path(folder_outputs) / run
-    folder_outputs.mkdir(parents=True, exist_ok=True)
-
-    folder_name_parts = [f"comb_{assembly_mode}", f"c{conf}", f"ts{size_threshold}", f"mo{min_overlap}"]
+    #folder_outputs = Path(folder_outputs) / run
+    #folder_outputs.mkdir(parents=True, exist_ok=True)
 
-    if assembly_mode == "dbg":
-        folder_name_parts.insert(2, f"ks{kmer_size}")
+    #folder_name_parts = [f"comb_{assembly_mode}", f"c{conf}", f"ts{size_threshold}", f"mo{min_overlap}"]
 
-    if reference:
-        folder_name_parts.extend([f"mi{min_identity}", f"mm{max_mismatches}"])
+    # if assembly_mode == "dbg":
+    #     folder_name_parts.insert(2, f"ks{kmer_size}")
 
-    combination_folder_out = folder_outputs / "_".join(folder_name_parts)
-    create_subdirectories_outputs(combination_folder_out)
+    #if reference:
+    #    folder_name_parts.extend([f"mi{min_identity}", f"mm{max_mismatches}"])
 
-    print(f"Output folders created at: {combination_folder_out}")
+    # combination_folder_out = folder_outputs / "_".join(folder_name_parts)
+    # create_subdirectories_outputs(combination_folder_out)
 
-    # data cleaning
+    # print(f"Output folders created at: {combination_folder_out}")
 
     logger.info("Starting data cleaning...")

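With the assembly and output-folder bookkeeping commented out, main() reduces to a cleaning step driven entirely by explicit paths. A sketch of an equivalent programmatic call, mirroring the example command at the end of this diff; all paths are illustrative:

    from instanexus.preprocessing import main

    main(
        input_csv="inputs/bsa.csv",
        metadata_json="json/sample_metadata.json",
        contaminants_fasta="fasta/contaminants.fasta",
        chain="",        # optional chain identifier
        reference=False,
        conf=0.9,        # pass None to skip the confidence filter
        output_csv_path="outputs/bsa_cleaned.csv",
    )
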
@@ -290,7 +254,7 @@ def main(
     cleaned_psms = df["cleaned_preds"].tolist()
 
     filtered_psms = filter_contaminants(
-        cleaned_psms, run, repo_folder / "fasta/contaminants.fasta"
+        cleaned_psms, run, contaminants_fasta
     )
     df = df[df["cleaned_preds"].isin(filtered_psms)]

@@ -300,16 +264,21 @@ def main(
     )
 
     # probably confidence trhreshold won't be necessary anymore
-    df = df[df["conf"] > conf]
+    if conf is not None:
+        logger.info(f"Applying confidence threshold: {conf}")
+        df = df[df["conf"] > conf]
+    else:
+        logger.info("No confidence threshold applied.")
 
     df.reset_index(drop=True, inplace=True)
 
     logger.info("Data cleaning completed.")
+    cleaned_csv_path = Path(output_csv_path)
+    cleaned_csv_path.parent.mkdir(parents=True, exist_ok=True)
 
-    cleaned_csv_path = combination_folder_out / "cleaned" / "cleaned_data.csv"
+    #cleaned_csv_path = combination_folder_out / "cleaned" / "cleaned_data.csv"
 
     df.to_csv(cleaned_csv_path, index=False)
-
     logger.info("Cleaned data saved to: {}.".format(cleaned_csv_path))
 

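Because conf now defaults to None, the old implicit 0.88 cutoff is no longer applied; filtering only happens when a threshold is passed. A toy illustration of the new branch, with made-up values:

    import pandas as pd

    df = pd.DataFrame({"cleaned_preds": ["PEPTLDE", "SEQLENCE"], "conf": [0.95, 0.42]})

    conf = None                # new default: keep every row
    if conf is not None:
        df = df[df["conf"] > conf]
    assert len(df) == 2

    df = df[df["conf"] > 0.9]  # the old cutoff must now be requested explicitly
    assert len(df) == 1
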
@@ -332,80 +301,84 @@ def cli():
         default="",
         help="Chain identifier for the sample (optional).",
     )
-    parser.add_argument(
-        "--folder-outputs",
-        type=str,
-        default="outputs",
-        help="Folder to save output files.",
-    )
+    # parser.add_argument(
+    #     "--folder-outputs",
+    #     type=str,
+    #     default="outputs",
+    #     help="Folder to save output files.",
+    # )
     parser.add_argument(
         "--reference",
         action="store_true",
         help="Whether to use reference protein sequence for mapping.",
     )
-
-    parser.add_argument(
-        "--assembly-mode",
-        type=str,
-        choices=["dbg", "greedy"],
-        help="Assembly algorithm to use.",
-    )
-    parser.add_argument(
-        "--kmer-size",
-        type=int,
-        default=6,
-        help="K-mer size (only used if --assembly-mode dbg).",
-    )
-    parser.add_argument(
-        "--min-identity",
-        type=float,
-        default=0.8,
-        help="Minimum identity threshold (only used if --reference).",
-    )
-    parser.add_argument(
-        "--max-mismatches",
-        type=int,
-        default=14,
-        help="Maximum allowed mismatches (only used if --reference).",
-    )
-
+    # parser.add_argument(
+    #     "--assembly-mode",
+    #     type=str,
+    #     choices=["dbg", "greedy"],
+    #     required=True,
+    #     help="Assembly algorithm to use.",
+    # )
+    # parser.add_argument(
+    #     "--kmer-size",
+    #     type=int,
+    #     default=7,
+    #     help="K-mer size (only used if --assembly-mode dbg).",
+    # )
+    # parser.add_argument(
+    #     "--min-identity",
+    #     type=float,
+    #     default=0.8,
+    #     help="Minimum identity threshold (only used if --reference).",
+    # )
+    # parser.add_argument(
+    #     "--max-mismatches",
+    #     type=int,
+    #     default=14,
+    #     help="Maximum allowed mismatches (only used if --reference).",
+    # )
     parser.add_argument(
         "--conf",
         type=float,
-        default=0.88,
+        default=None,
         help="Confidence threshold for filtering (default: 0.88).",
     )
+    # parser.add_argument(
+    #     "--size-threshold",
+    #     type=int,
+    #     default=10,
+    #     help="Minimum contig size threshold (default: 10).",
+    # )
+    # parser.add_argument(
+    #     "--min-overlap",
+    #     type=int,
+    #     default=3,
+    #     help="Minimum overlap size between reads (default: 3).",
+    # )
     parser.add_argument(
-        "--size-threshold",
-        type=int,
-        default=10,
-        help="Minimum contig size threshold (default: 10).",
+        "--metadata-json",
+        type=str,
+        required=True,
+        help="Path to the sample_metadata.json file.",
+    )
+    parser.add_argument(
+        "--contaminants-fasta",
+        type=str,
+        required=True,
+        help="Path to the contaminants.fasta file.",
     )
     parser.add_argument(
-        "--min-overlap",
-        type=int,
-        default=4,
-        help="Minimum overlap size between reads (default: 4).",
+        "--output-csv-path",
+        type=str,
+        required=True,
+        help="Path to the output CSV file."
     )
 
     args = parser.parse_args()
 
-    main(
-        input_csv=args.input_csv,
-        chain=args.chain,
-        folder_outputs=args.folder_outputs,
-        reference=args.reference,
-        assembly_mode=args.assembly_mode,
-        conf=args.conf,
-        kmer_size=args.kmer_size,
-        size_threshold=args.size_threshold,
-        min_overlap=args.min_overlap,
-        min_identity=args.min_identity,
-        max_mismatches=args.max_mismatches
-    )
-
+    main(**vars(args))
 
 if __name__ == "__main__":
     cli()
 
-# python -m instanexus.preprocessing --input-csv ../../inputs/bsa.csv --folder-outputs ../../outputs --assembly-mode dbg --conf 0.9 --kmer-size 7 --size-threshold 12 --min-overlap 5
+# python -m instanexus.preprocessing --input-csv inputs/bsa.csv --metadata-json json/sample_metadata.json --contaminants-fasta fasta/contaminants.fasta --output-csv-path outputs/bsa_cleaned.csv --conf 0.9 --reference

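The new main(**vars(args)) call works because argparse turns each --flag-name into a flag_name attribute, and those attribute names now line up one-to-one with main()'s parameters. A small sketch of that mapping, with hypothetical values:

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--input-csv", required=True)
    parser.add_argument("--output-csv-path", required=True)
    args = parser.parse_args(
        ["--input-csv", "inputs/bsa.csv", "--output-csv-path", "outputs/bsa_cleaned.csv"]
    )
    # dashes become underscores, so vars(args) keys match main()'s signature
    print(vars(args))  # {'input_csv': 'inputs/bsa.csv', 'output_csv_path': 'outputs/bsa_cleaned.csv'}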