diff --git a/docs/source/conf.py b/docs/source/conf.py index 488bea2..309f49e 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -6,23 +6,22 @@ # -- Project information ----------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information -project = 'InstaNexus' -copyright = '2025, Marco Reverenna' -author = 'Marco Reverenna' -release = '0.2.0' +project = "InstaNexus" +copyright = "2025, Marco Reverenna" +author = "Marco Reverenna" +release = "0.2.0" # -- General configuration --------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration extensions = [] -templates_path = ['_templates'] +templates_path = ["_templates"] exclude_patterns = [] - # -- Options for HTML output ------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output -html_theme = 'alabaster' -html_static_path = ['_static'] +html_theme = "alabaster" +html_static_path = ["_static"] diff --git a/docs/source/tutorials/case_studies/prot_optimization_dbg.ipynb b/docs/source/tutorials/case_studies/prot_optimization_dbg.ipynb index ecb62aa..7b539a9 100755 --- a/docs/source/tutorials/case_studies/prot_optimization_dbg.ipynb +++ b/docs/source/tutorials/case_studies/prot_optimization_dbg.ipynb @@ -519,7 +519,7 @@ " sequence_type=\"contigs\",\n", " output_folder=RESULTS_DIR,\n", " reference=protein_norm,\n", - " **params\n", + " **params,\n", " )\n", "\n", " coverage_contigs = stat_contigs.get(\"coverage\")\n", @@ -553,7 +553,7 @@ " sequence_type=\"scaffolds\",\n", " output_folder=RESULTS_DIR,\n", " reference=protein_norm,\n", - " **params\n", + " **params,\n", " )\n", "\n", " coverage_scaffolds = stat_scaffolds.get(\"coverage\")\n", @@ -814,7 +814,7 @@ " sequence_type=\"contigs\",\n", " output_folder=RESULTS_DIR,\n", " reference=protein_norm,\n", - " **params\n", + " **params,\n", " )\n", " coverage_contigs = stat_contigs.get(\"coverage\")\n", "\n", @@ -847,7 +847,7 @@ " sequence_type=\"scaffolds\",\n", " output_folder=RESULTS_DIR,\n", " reference=protein_norm,\n", - " **params\n", + " **params,\n", " )\n", "\n", " coverage_scaffolds = stat_scaffolds.get(\"coverage\")\n", diff --git a/docs/source/tutorials/case_studies/prot_optimization_greedy.ipynb b/docs/source/tutorials/case_studies/prot_optimization_greedy.ipynb index cd5d149..cfbbba7 100755 --- a/docs/source/tutorials/case_studies/prot_optimization_greedy.ipynb +++ b/docs/source/tutorials/case_studies/prot_optimization_greedy.ipynb @@ -508,7 +508,7 @@ " sequence_type=\"contigs\",\n", " output_folder=\".\",\n", " reference=protein_norm,\n", - " **params\n", + " **params,\n", " )\n", "\n", " coverage_contigs = stat_contigs.get(\"coverage\")\n", @@ -556,7 +556,7 @@ " sequence_type=\"scaffolds\",\n", " output_folder=\".\",\n", " reference=protein_norm,\n", - " **params\n", + " **params,\n", " )\n", " coverage_scaffolds = stat_scaffolds.get(\"coverage\")\n", "\n", @@ -740,7 +740,7 @@ " sequence_type=\"contigs\",\n", " output_folder=\".\",\n", " reference=protein_norm,\n", - " **params\n", + " **params,\n", " )\n", " coverage_contigs = stat_contigs.get(\"coverage\")\n", "\n", @@ -784,7 +784,7 @@ " sequence_type=\"scaffolds\",\n", " output_folder=\".\",\n", " reference=protein_norm,\n", - " **params\n", + " **params,\n", " )\n", " coverage_scaffolds = stat_scaffolds.get(\"coverage\")\n", "\n", diff --git 
a/docs/source/tutorials/examples/dbg_variants_workflow.ipynb b/docs/source/tutorials/examples/dbg_variants_workflow.ipynb index c046c11..14677fd 100644 --- a/docs/source/tutorials/examples/dbg_variants_workflow.ipynb +++ b/docs/source/tutorials/examples/dbg_variants_workflow.ipynb @@ -48,7 +48,7 @@ "outputs": [], "source": [ "# read a pre cleaned data file\n", - "#data = pd.read_csv(\"../outputs/bsa/comb_dbg_c0.9_ks7_ts12_mo3/cleaned/cleaned_data.csv\")" + "# data = pd.read_csv(\"../outputs/bsa/comb_dbg_c0.9_ks7_ts12_mo3/cleaned/cleaned_data.csv\")" ] }, { @@ -62,9 +62,9 @@ "\n", "import re\n", "\n", - "file_name = 'bsa'\n", + "file_name = \"bsa\"\n", "\n", - "data = pd.read_csv(f'../inputs/{file_name}.csv'.format(file_name=file_name))\n", + "data = pd.read_csv(f\"../inputs/{file_name}.csv\".format(file_name=file_name))\n", "\n", "data[\"log_probs\"] = data[\"log_probs\"].replace(-1, -10)\n", "\n", @@ -99,13 +99,12 @@ "metadata": {}, "outputs": [], "source": [ - "from pathlib import Path\n", "\n", "repo_folder = Path(\"../\")\n", "\n", "filtered_psms = instanexus.preprocessing.filter_contaminants(\n", - " cleaned_psms, run, repo_folder / \"fasta/contaminants.fasta\"\n", - " )\n", + " cleaned_psms, run, repo_folder / \"fasta/contaminants.fasta\"\n", + ")\n", "\n", "data = data[data[\"preds\"].isin(filtered_psms)]" ] @@ -158,10 +157,10 @@ "source": [ "assembler = Assembler(\n", " mode=\"dbg_weighted\",\n", - " kmer_size=7, \n", - " size_threshold=0, \n", - " min_weight=2, # filter low-weight edges\n", - " refine_rounds=3, # optional iterative refinement\n", + " kmer_size=7,\n", + " size_threshold=0,\n", + " min_weight=2, # filter low-weight edges\n", + " refine_rounds=3, # optional iterative refinement\n", ")" ] }, @@ -172,7 +171,9 @@ "metadata": {}, "outputs": [], "source": [ - "scaffolds_dbg_w = assembler.run(sequences, output_folder=output_folder, protein_norm=None)" + "scaffolds_dbg_w = assembler.run(\n", + " sequences, output_folder=output_folder, protein_norm=None\n", + ")" ] }, { @@ -242,7 +243,9 @@ "metadata": {}, "outputs": [], "source": [ - "mapped_contigs = map.process_protein_contigs_scaffold(scaffolds_dbg_w, protein_norm, max_mismatches = 10, min_identity = 0.8)" + "mapped_contigs = map.process_protein_contigs_scaffold(\n", + " scaffolds_dbg_w, protein_norm, max_mismatches=10, min_identity=0.8\n", + ")" ] }, { @@ -338,8 +341,8 @@ "assembler_dbgx = Assembler(\n", " mode=\"dbgX\",\n", " kmer_size=7,\n", - " size_threshold=10, \n", - " min_weight=2, \n", + " size_threshold=10,\n", + " min_weight=2,\n", ")" ] }, @@ -351,9 +354,7 @@ "outputs": [], "source": [ "scaffolds_dbgx = assembler_dbgx.run(\n", - " sequences=sequences,\n", - " output_folder=output_folder,\n", - " protein_norm=None\n", + " sequences=sequences, output_folder=output_folder, protein_norm=None\n", ")" ] }, @@ -364,7 +365,9 @@ "metadata": {}, "outputs": [], "source": [ - "mapped_scaffolds_dbgx = map.process_protein_contigs_scaffold(scaffolds_dbgx, protein_norm, max_mismatches = 10, min_identity = 0.8)" + "mapped_scaffolds_dbgx = map.process_protein_contigs_scaffold(\n", + " scaffolds_dbgx, protein_norm, max_mismatches=10, min_identity=0.8\n", + ")" ] }, { @@ -427,7 +430,7 @@ " mode=\"fusion\",\n", " kmer_size=7,\n", " size_threshold=10,\n", - " min_overlap=3, \n", + " min_overlap=3,\n", " min_weight=2,\n", ")" ] @@ -450,9 +453,7 @@ "outputs": [], "source": [ "scaffolds_fusion = assembler_fusion.run(\n", - " sequences=sequences,\n", - " output_folder=output_folder_fusion,\n", - " protein_norm=None\n", + " 
sequences=sequences, output_folder=output_folder_fusion, protein_norm=None\n", ")" ] }, @@ -463,7 +464,9 @@ "metadata": {}, "outputs": [], "source": [ - "mapped_scaffolds_fusion = map.process_protein_contigs_scaffold(scaffolds_fusion, protein_norm, max_mismatches=10, min_identity=0.8)\n", + "mapped_scaffolds_fusion = map.process_protein_contigs_scaffold(\n", + " scaffolds_fusion, protein_norm, max_mismatches=10, min_identity=0.8\n", + ")\n", "\n", "# top 20\n", "mapped_scaffolds_fusion = mapped_scaffolds_fusion[:20]" diff --git a/docs/source/tutorials/examples/hybrid_workflow_with_figures.ipynb b/docs/source/tutorials/examples/hybrid_workflow_with_figures.ipynb index de904df..afbbea0 100644 --- a/docs/source/tutorials/examples/hybrid_workflow_with_figures.ipynb +++ b/docs/source/tutorials/examples/hybrid_workflow_with_figures.ipynb @@ -39,19 +39,17 @@ "import alignment as align\n", "import clustering as clus\n", "import preprocessing as prep\n", - "import compute_statistics as comp_stat\n", - "#import model_peptide_selector as selector\n", + "\n", + "# import model_peptide_selector as selector\n", "\n", "# import libraries\n", "from pathlib import Path\n", "from Bio import SeqIO\n", "\n", - "#import joblib\n", + "# import joblib\n", "import json\n", "import Bio\n", - "import pandas as pd\n", - "import matplotlib.pyplot as plt\n", - "import seaborn as sns" + "import pandas as pd" ] }, { @@ -131,16 +129,10 @@ "metadata": {}, "outputs": [], "source": [ - "def get_combination_name(\n", - " ass_method,\n", - " conf,\n", - " kmer_size,\n", - " size_threshold,\n", - " min_overlap\n", - "):\n", + "def get_combination_name(ass_method, conf, kmer_size, size_threshold, min_overlap):\n", " if ass_method in (\"dbg\", \"hybrid\"):\n", " return f\"comb_{ass_method}_c{conf}_ks{kmer_size}_ts{size_threshold}_mo{min_overlap}\"\n", - " \n", + "\n", " elif ass_method == \"greedy\":\n", " return f\"comb_{ass_method}_c{conf}_ts{size_threshold}_mo{min_overlap}\"" ] @@ -186,12 +178,7 @@ "metadata": {}, "outputs": [], "source": [ - "comb = get_combination_name(\n", - " ass_method,\n", - " conf,\n", - " kmer_size,\n", - " size_threshold,\n", - " min_overlap)\n", + "comb = get_combination_name(ass_method, conf, kmer_size, size_threshold, min_overlap)\n", "\n", "print(comb)" ] @@ -207,7 +194,7 @@ " \"ass_method\": ass_method,\n", " \"conf\": conf,\n", " \"size_threshold\": size_threshold,\n", - " \"min_overlap\": min_overlap\n", + " \"min_overlap\": min_overlap,\n", "}" ] }, @@ -346,7 +333,9 @@ " filtered_df = df[mask].copy()\n", " removed_count = (~mask).sum()\n", "\n", - " print(f\"Removed {removed_count} contaminant sequences, {len(filtered_df)} remaining.\")\n", + " print(\n", + " f\"Removed {removed_count} contaminant sequences, {len(filtered_df)} remaining.\"\n", + " )\n", " return filtered_df" ] }, @@ -492,9 +481,9 @@ "metadata": {}, "outputs": [], "source": [ - "greedy_scaffolds = greedy.scaffold_iterative_greedy(assembled_contigs,\n", - " min_overlap,\n", - " size_threshold)" + "greedy_scaffolds = greedy.scaffold_iterative_greedy(\n", + " assembled_contigs, min_overlap, size_threshold\n", + ")" ] }, { @@ -655,10 +644,12 @@ "outputs": [], "source": [ "mapped_scaffolds = map.process_protein_contigs_scaffold(\n", - " all_scaffolds, protein_norm, max_mismatches = 0, min_identity = 0.90\n", + " all_scaffolds, protein_norm, max_mismatches=0, min_identity=0.90\n", ")\n", "\n", - "map.mapping_substitutions(mapped_scaffolds, protein_norm, title= \"scaffolds mapped in RF-selected peptides\")" + 
"map.mapping_substitutions(\n", + " mapped_scaffolds, protein_norm, title=\"scaffolds mapped in RF-selected peptides\"\n", + ")" ] }, { @@ -754,14 +745,14 @@ "source": [ "clus.cluster_fasta_files(input_folder=str(scaffolds_folder_out))\n", "\n", - "fasta_input = scaffolds_folder_out / f\"scaffolds.fasta\"\n", + "fasta_input = scaffolds_folder_out / \"scaffolds.fasta\"\n", "\n", "cluster_tsv_folder = clustering_out / run_id\n", - " \n", + "\n", "clus.process_fasta_and_clusters(\n", - " fasta_file=str(fasta_input),\n", - " input_folder=str(scaffolds_folder_out),\n", - " )" + " fasta_file=str(fasta_input),\n", + " input_folder=str(scaffolds_folder_out),\n", + ")" ] }, { @@ -798,10 +789,10 @@ "outputs": [], "source": [ "cons.process_alignment_files(\n", - " align_folder=str(alignment_out),\n", - " output_folder=str(consensus_out),\n", - " run_id=run_id,\n", - " )" + " align_folder=str(alignment_out),\n", + " output_folder=str(consensus_out),\n", + " run_id=run_id,\n", + ")" ] } ], diff --git a/docs/source/tutorials/examples/hybrid_workflow_with_selector.ipynb b/docs/source/tutorials/examples/hybrid_workflow_with_selector.ipynb index ddf204f..8392977 100644 --- a/docs/source/tutorials/examples/hybrid_workflow_with_selector.ipynb +++ b/docs/source/tutorials/examples/hybrid_workflow_with_selector.ipynb @@ -44,7 +44,6 @@ "import alignment as align\n", "import clustering as clus\n", "import preprocessing as prep\n", - "import compute_statistics as comp_stat\n", "import model_peptide_selector as selector\n", "\n", "# import libraries\n", @@ -135,16 +134,10 @@ "metadata": {}, "outputs": [], "source": [ - "def get_combination_name(\n", - " ass_method,\n", - " conf,\n", - " kmer_size,\n", - " size_threshold,\n", - " min_overlap\n", - "):\n", + "def get_combination_name(ass_method, conf, kmer_size, size_threshold, min_overlap):\n", " if ass_method in (\"dbg\", \"hybrid\"):\n", " return f\"comb_{ass_method}_c{conf}_ks{kmer_size}_ts{size_threshold}_mo{min_overlap}\"\n", - " \n", + "\n", " elif ass_method == \"greedy\":\n", " return f\"comb_{ass_method}_c{conf}_ts{size_threshold}_mo{min_overlap}\"" ] @@ -207,12 +200,7 @@ } ], "source": [ - "comb = get_combination_name(\n", - " ass_method,\n", - " conf,\n", - " kmer_size,\n", - " size_threshold,\n", - " min_overlap)\n", + "comb = get_combination_name(ass_method, conf, kmer_size, size_threshold, min_overlap)\n", "\n", "print(comb)" ] @@ -228,7 +216,7 @@ " \"ass_method\": ass_method,\n", " \"conf\": conf,\n", " \"size_threshold\": size_threshold,\n", - " \"min_overlap\": min_overlap\n", + " \"min_overlap\": min_overlap,\n", "}" ] }, @@ -410,7 +398,7 @@ } ], "source": [ - "#protein_norm = prep.normalize_sequence(protein)\n", + "# protein_norm = prep.normalize_sequence(protein)\n", "\n", "df = pd.read_csv(INPUT_DIR / f\"{run}.csv\")\n", "\n", @@ -471,7 +459,9 @@ } ], "source": [ - "df[df.duplicated(subset=[\"cleaned_preds\"], keep=False)][[\"cleaned_preds\"]].value_counts()" + "df[df.duplicated(subset=[\"cleaned_preds\"], keep=False)][\n", + " [\"cleaned_preds\"]\n", + "].value_counts()" ] }, { @@ -500,7 +490,12 @@ } ], "source": [ - "print(df.loc[df[\"cleaned_preds\"].str.len().sort_values(ascending=False).index, \"cleaned_preds\"])" + "print(\n", + " df.loc[\n", + " df[\"cleaned_preds\"].str.len().sort_values(ascending=False).index,\n", + " \"cleaned_preds\",\n", + " ]\n", + ")" ] }, { @@ -543,7 +538,7 @@ "outputs": [], "source": [ "filtered_psms = prep.filter_contaminants(\n", - "cleaned_psms, run, FASTA_DIR / \"contaminants.fasta\"\n", + " cleaned_psms, 
run, FASTA_DIR / \"contaminants.fasta\"\n", ")" ] }, @@ -600,7 +595,7 @@ "aa_props = selector.load_aa_properties(JSON_DIR / \"aa_properties.json\")\n", "protease_rules = selector.load_protease_rules(JSON_DIR / \"protease_rules.json\")\n", "\n", - "df =selector.build_reference_free_features(df, aa_props, protease_rules)" + "df = selector.build_reference_free_features(df, aa_props, protease_rules)" ] }, { @@ -667,7 +662,9 @@ "metadata": {}, "outputs": [], "source": [ - "print(f\"Total PSMs: {len(df)}, Accepted: {len(accepted)}, Rejected: {len(df) - len(accepted)}\")" + "print(\n", + " f\"Total PSMs: {len(df)}, Accepted: {len(accepted)}, Rejected: {len(df) - len(accepted)}\"\n", + ")" ] }, { @@ -703,12 +700,14 @@ " # Force legend to appear with proper labels\n", " handles, labels = ax.get_legend_handles_labels()\n", " if handles and labels:\n", - " ax.legend(handles=handles, labels=labels, title=None, frameon=False, loc=\"upper right\")\n", + " ax.legend(\n", + " handles=handles, labels=labels, title=None, frameon=False, loc=\"upper right\"\n", + " )\n", "\n", " plt.tight_layout()\n", " Path(output_dir).mkdir(parents=True, exist_ok=True)\n", - " #plt.savefig(Path(output_dir) / \"peptide_length_distribution.svg\", format=\"svg\")\n", - " plt.show()\n" + " # plt.savefig(Path(output_dir) / \"peptide_length_distribution.svg\", format=\"svg\")\n", + " plt.show()" ] }, { @@ -759,7 +758,7 @@ "metadata": {}, "outputs": [], "source": [ - "unique_peptides = list(set(final_psms)) # remove duplicates\n", + "unique_peptides = list(set(final_psms)) # remove duplicates\n", "print(len(unique_peptides))" ] }, @@ -775,7 +774,7 @@ "\n", "for pep in sorted_peps:\n", " if not any(pep in other for other in non_redundant):\n", - " non_redundant.append(pep)\n" + " non_redundant.append(pep)" ] }, { @@ -852,7 +851,7 @@ " cluster_sorted = sorted(cluster, key=lambda x: (freq[x], len(x)), reverse=True)\n", " representatives.append(cluster_sorted[0])\n", "\n", - " return representatives, clusters\n" + " return representatives, clusters" ] }, { @@ -862,11 +861,13 @@ "metadata": {}, "outputs": [], "source": [ - "representatives, clusters = cluster_peptides_by_identity_same_length(filtered, threshold=0.9)\n", + "representatives, clusters = cluster_peptides_by_identity_same_length(\n", + " filtered, threshold=0.9\n", + ")\n", "\n", "print(f\"Original peptides: {len(filtered)}\")\n", "print(f\"Clusters found: {len(clusters)}\")\n", - "print(f\"Non-redundant peptides: {len(representatives)}\")\n" + "print(f\"Non-redundant peptides: {len(representatives)}\")" ] }, { @@ -908,7 +909,9 @@ "metadata": {}, "outputs": [], "source": [ - "map.mapping_substitutions(mapped_psms, protein_norm, title= \"psms mapped in RF-selected peptides\")" + "map.mapping_substitutions(\n", + " mapped_psms, protein_norm, title=\"psms mapped in RF-selected peptides\"\n", + ")" ] }, { @@ -918,7 +921,9 @@ "metadata": {}, "outputs": [], "source": [ - "def plot_confidence_distribution(df, accepted_mask, output_dir, filename=\"psm_selected_rf_distribution.svg\"):\n", + "def plot_confidence_distribution(\n", + " df, accepted_mask, output_dir, filename=\"psm_selected_rf_distribution.svg\"\n", + "):\n", " \"\"\"\n", " Plot and save the distribution of peptide confidence scores using Seaborn.\"\"\"\n", "\n", @@ -1011,7 +1016,7 @@ " plt.close()\n", "\n", " print(f\"Peptide length distribution saved as: {out_path}\")\n", - " return out_path\n" + " return out_path" ] }, { @@ -1162,7 +1167,9 @@ "metadata": {}, "outputs": [], "source": [ - 
"map.mapping_substitutions(mapped_contigs, protein_norm, title= \"psms mapped in RF-selected peptides\")" + "map.mapping_substitutions(\n", + " mapped_contigs, protein_norm, title=\"psms mapped in RF-selected peptides\"\n", + ")" ] }, { @@ -1374,7 +1381,9 @@ " assembled_scaffolds, protein_norm, max_mismatches, min_identity\n", ")\n", "\n", - "map.mapping_substitutions(mapped_scaffolds, protein_norm, title= \"scaffolds mapped in RF-selected peptides\")" + "map.mapping_substitutions(\n", + " mapped_scaffolds, protein_norm, title=\"scaffolds mapped in RF-selected peptides\"\n", + ")" ] }, { diff --git a/pyproject.toml b/pyproject.toml index 94773b5..2be9520 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,6 +9,11 @@ packages = ["src/instanexus"] name = "instanexus" version = "0.2.0" description = "End-to-end workflow for de novo protein sequencing based on InstaNovo" + +authors = [ + { name = "Marco Reverenna", email = "marcor@dtu.dk" } +] + readme = "README.md" readme-content-type = "text/markdown" requires-python = ">=3.10" @@ -56,7 +61,3 @@ target-version = ['py311'] [tool.isort] profile = "black" - -authors = [ - { name = "Marco Reverenna", email = "marcor@dtu.dk" } -] diff --git a/scripts/gridsearch.py b/scripts/gridsearch.py index a5a418a..da67254 100755 --- a/scripts/gridsearch.py +++ b/scripts/gridsearch.py @@ -26,10 +26,9 @@ from concurrent.futures import ProcessPoolExecutor, as_completed from pathlib import Path -from tqdm import tqdm - from src.opt.opt_dbg import run_pipeline_dbg from src.opt.opt_greedy import run_pipeline_greedy +from tqdm import tqdm BASE_DIR = Path(__file__).resolve().parents[2] diff --git a/scripts/model_peptide_selector.py b/scripts/model_peptide_selector.py index 57e60bf..7f9adf0 100644 --- a/scripts/model_peptide_selector.py +++ b/scripts/model_peptide_selector.py @@ -18,23 +18,24 @@ """ -import re import json -import pandas as pd -import numpy as np -import preprocessing as prep +import re from math import log2 from pathlib import Path + +import joblib +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import preprocessing as prep +import seaborn as sns from sklearn.ensemble import RandomForestClassifier -from sklearn.model_selection import train_test_split from sklearn.metrics import ( - precision_recall_curve, - f1_score, average_precision_score, + f1_score, + precision_recall_curve, ) -import matplotlib.pyplot as plt -import joblib -import seaborn as sns +from sklearn.model_selection import train_test_split # side meaning where the protease cleaves # residues meaning which amino acids it cleaves after (C-side) or before (N-side diff --git a/src/instanexus/__main__.py b/src/instanexus/__main__.py index 005b782..6d83bb4 100644 --- a/src/instanexus/__main__.py +++ b/src/instanexus/__main__.py @@ -1,7 +1,9 @@ -import sys import argparse +import sys + from . 
import script_dbg, script_greedy
+

 def main():
     banner = r"""
  ______ __ __ __
@@ -15,8 +17,9 @@ def main():
     parser = argparse.ArgumentParser(
         prog="instanexus",
-        description=(banner + "\n"
-        "InstaNexus CLI: de novo protein sequencing based on InstaNovo,\n\n" \
+        description=(
+            banner + "\n"
+            "InstaNexus CLI: de novo protein sequencing based on InstaNovo,\n\n"
             "an end-to-end workflow from de novo peptides to proteins\n\n"
             "Usage:\n"
             "  instanexus [options]\n\n"
@@ -30,8 +33,8 @@ def main():
         ),
         formatter_class=argparse.RawTextHelpFormatter,
     )
-
-    parser.add_argument('--version', action='version', version='InstaNexus 0.1.0'),
+
+    parser.add_argument("--version", action="version", version="InstaNexus 0.2.0")

     subparsers = parser.add_subparsers(dest="command", help="subcommands")
@@ -50,5 +53,6 @@ def main():
     else:
         parser.print_help()

+
 if __name__ == "__main__":
     main()
diff --git a/src/instanexus/alignment.py b/src/instanexus/alignment.py
index 43438c1..fcc46b2 100755
--- a/src/instanexus/alignment.py
+++ b/src/instanexus/alignment.py
@@ -3,14 +3,14 @@
 r"""Alignment module for clustered scaffolds.

  ██████████ ███████████ █████ █████
-░░███░░░░███ ░█░░░███░░░█░░███ ░░███ 
- ░███ ░░███░ ░███ ░ ░███ ░███ 
- ░███ ░███ ░███ ░███ ░███ 
- ░███ ░███ ░███ ░███ ░███ 
- ░███ ███ ░███ ░███ ░███ 
- ██████████ █████ ░░████████ 
-░░░░░░░░░░ ░░░░░ ░░░░░░░░ 
- 
+░░███░░░░███ ░█░░░███░░░█░░███ ░░███
+ ░███ ░░███░ ░███ ░ ░███ ░███
+ ░███ ░███ ░███ ░███ ░███
+ ░███ ░███ ░███ ░███ ░███
+ ░███ ███ ░███ ░███ ░███
+ ██████████ █████ ░░████████
+░░░░░░░░░░ ░░░░░ ░░░░░░░░
+
 __authors__ = Marco Reverenna
 __copyright__ = Copyright 2025-2026
 __research-group__ = DTU Biosustain (Multi-omics Network Analytics) and DTU Bioengineering
@@ -26,9 +26,12 @@
 import shutil
 import subprocess
 from pathlib import Path
+
 from Bio import SeqIO

-logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s"
+)
 logger = logging.getLogger(__name__)


@@ -49,16 +52,29 @@ def align_or_copy_fasta(fasta_file, output_file):
         logger.debug(f"Copied single-sequence file: {Path(fasta_file).name}")
     elif len(sequences) > 1:
         # Multiple sequences, run clustalo
-        logger.debug(f"Aligning {len(sequences)} sequences from {Path(fasta_file).name}...")
+        logger.debug(
+            f"Aligning {len(sequences)} sequences from {Path(fasta_file).name}..."
+        )
         try:
             subprocess.run(
-                ["clustalo", "-i", fasta_file, "-o", output_file, "--outfmt", "fa", "--force"],
+                [
+                    "clustalo",
+                    "-i",
+                    fasta_file,
+                    "-o",
+                    output_file,
+                    "--outfmt",
+                    "fa",
+                    "--force",
+                ],
                 check=True,
-                capture_output=True, # Suppress clustalo stdout
-                text=True
+                capture_output=True,  # Suppress clustalo stdout
+                text=True,
             )
         except FileNotFoundError:
-            logger.error("clustalo command not found. Please ensure it is in your system's PATH.")
+            logger.error(
+                "clustalo command not found. Please ensure it is in your system's PATH."
+            )
             raise
         except subprocess.CalledProcessError as e:
             logger.error(f"Clustalo failed for {fasta_file}: {e.stderr}")
@@ -71,23 +87,24 @@
 def process_alignment(input_dir: str, output_dir: str):
     """
     Align all FASTA files from input_dir and save results in output_dir.
-    
+
     Args:
         input_dir (str): Path to the .../cluster_fasta/ directory.
         output_dir (str): Path to the .../alignment/ directory.
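+
+        Example (paths follow the CLI usage note at the end of this module):
+            process_alignment(
+                "outputs/bsa/scaffolds/clustering/cluster_fasta",
+                "outputs/bsa/scaffolds/alignment",
+            )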
""" cluster_fasta_folder = Path(input_dir) alignment_folder = Path(output_dir) - + alignment_folder.mkdir(parents=True, exist_ok=True) if not cluster_fasta_folder.exists(): logger.error(f"Cluster FASTA folder not found: {cluster_fasta_folder}") - raise FileNotFoundError(f"Cluster FASTA folder not found: {cluster_fasta_folder}") + raise FileNotFoundError( + f"Cluster FASTA folder not found: {cluster_fasta_folder}" + ) fasta_files_to_align = [ - f for f in sorted(os.listdir(cluster_fasta_folder)) - if f.endswith(".fasta") + f for f in sorted(os.listdir(cluster_fasta_folder)) if f.endswith(".fasta") ] logger.info(f"Found {len(fasta_files_to_align)} cluster FASTA files to align.") @@ -95,15 +112,14 @@ def process_alignment(input_dir: str, output_dir: str): for fasta_file in fasta_files_to_align: fasta_path = cluster_fasta_folder / fasta_file # Save output as .afa (alignment FASTA) - output_path = alignment_folder / fasta_file.replace(".fasta", "_aligned.afa") - + output_path = alignment_folder / fasta_file.replace(".fasta", "_aligned.afa") + align_or_copy_fasta(fasta_path, output_path) logger.info("All alignment tasks completed.") -def main(input_cluster_fasta_folder: str, - output_alignment_folder: str): +def main(input_cluster_fasta_folder: str, output_alignment_folder: str): """ Main function to run the alignment script. """ @@ -113,10 +129,9 @@ def main(input_cluster_fasta_folder: str, logger.info(f"Output Folder (Alignments): {output_alignment_folder}") process_alignment( - input_dir=input_cluster_fasta_folder, - output_dir=output_alignment_folder + input_dir=input_cluster_fasta_folder, output_dir=output_alignment_folder ) - + logger.info("--- Step 4: Alignment Completed ---") @@ -127,25 +142,26 @@ def cli(): parser = argparse.ArgumentParser( description="Alignment script for clustered scaffolds." ) - + parser.add_argument( "--input-folder", type=str, required=True, - help="Path to the folder containing cluster FASTA files (e.g., .../cluster_fasta)." + help="Path to the folder containing cluster FASTA files (e.g., .../cluster_fasta).", ) parser.add_argument( "--output-folder", type=str, required=True, - help="Path to the folder to save aligned .afa files (e.g., .../alignment)." + help="Path to the folder to save aligned .afa files (e.g., .../alignment).", ) - + args = parser.parse_args() - main(input_cluster_fasta_folder=args.input_folder, - output_alignment_folder=args.output_folder - ) + main( + input_cluster_fasta_folder=args.input_folder, + output_alignment_folder=args.output_folder, + ) if __name__ == "__main__": @@ -153,4 +169,4 @@ def cli(): # python -m instanexus.alignment \ # --input-folder outputs/bsa/scaffolds/clustering/cluster_fasta \ -# --output-folder outputs/bsa/scaffolds/alignment \ No newline at end of file +# --output-folder outputs/bsa/scaffolds/alignment diff --git a/src/instanexus/assembly.py b/src/instanexus/assembly.py index 6812367..e24bf62 100644 --- a/src/instanexus/assembly.py +++ b/src/instanexus/assembly.py @@ -3,14 +3,14 @@ r"""Assembly module for InstaNexus. 
██████████ ███████████ █████ █████ -░░███░░░░███ ░█░░░███░░░█░░███ ░░███ - ░███ ░░███░ ░███ ░ ░███ ░███ - ░███ ░███ ░███ ░███ ░███ - ░███ ░███ ░███ ░███ ░███ - ░███ ███ ░███ ░███ ░███ - ██████████ █████ ░░████████ -░░░░░░░░░░ ░░░░░ ░░░░░░░░ - +░░███░░░░███ ░█░░░███░░░█░░███ ░░███ + ░███ ░░███░ ░███ ░ ░███ ░███ + ░███ ░███ ░███ ░███ ░███ + ░███ ░███ ░███ ░███ ░███ + ░███ ███ ░███ ░███ ░███ + ██████████ █████ ░░████████ +░░░░░░░░░░ ░░░░░ ░░░░░░░░ + __authors__ = Marco Reverenna & Konstantinos Kalogeropoulus __copyright__ = Copyright 2024-2025 __research-group__ = DTU Biosustain (Multi-omics Network Analytics) and DTU Bioengineering @@ -20,34 +20,31 @@ __status__ = Dev """ +import argparse + # import libraries import logging +from collections import Counter, defaultdict +from dataclasses import dataclass +from itertools import combinations +from pathlib import Path +from typing import Dict, Iterable, List + +import Bio import networkx as nx import pandas as pd -import argparse -import Bio +from tqdm import tqdm from . import helpers from . import visualization as viz -from tqdm import tqdm -from pathlib import Path -from Bio import SeqIO -from collections import defaultdict -from collections import Counter -from itertools import combinations -from dataclasses import dataclass -from typing import List, Tuple, Dict, Any, Iterable, Optional - - logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) - def find_peptide_overlaps(peptides, min_overlap): """Finds overlaps between peptide sequences using a greedy approach.""" - overlaps = defaultdict(list) + overlaps = defaultdict(list) for index_a, peptide_a in tqdm(enumerate(peptides), desc="Finding overlaps"): for index_b, peptide_b in enumerate(peptides): @@ -136,14 +133,16 @@ def combine_seqs_into_scaffolds(contigs, min_overlap): def scaffold_iterative_greedy(contigs, min_overlap, size_threshold, disable_tqdm=False): """Iterative scaffolding using Greedy approach.""" + def clean(seqs): """Remove duplicates, filter by length, and sort by descending size.""" seqs = list(set(seqs)) seqs = [s for s in seqs if len(s) > size_threshold] return sorted(seqs, key=len, reverse=True) + current = clean(contigs) prev = None - + while prev != current: prev = current next_round = combine_seqs_into_scaffolds(current, min_overlap) @@ -152,7 +151,7 @@ def clean(seqs): if next_round == current: break current = next_round - + return current @@ -160,7 +159,7 @@ def get_kmers(seqs, kmer_size): """Generate k-mers of specified length from input sequences.""" kmers = [] for seq in seqs: - kmers.extend(seq[i:i+kmer_size] for i in range(len(seq)-kmer_size+1)) + kmers.extend(seq[i : i + kmer_size] for i in range(len(seq) - kmer_size + 1)) return kmers @@ -178,8 +177,7 @@ def get_kmer_counts(kmers): def get_debruijn_edges_from_kmers(kmers): - """Generate edges of a De Bruijn graph from a list of k-mers. 
- """ + """Generate edges of a De Bruijn graph from a list of k-mers.""" edges = set() k_1mers = defaultdict(set) for kmer in kmers: @@ -226,9 +224,7 @@ def traverse_iterative(start_node): def find_overlaps(contigs, min_overlap, disable_tqdm=False): """Find overlaps between pairs of contigs based on specified minimum overlap.""" overlaps = [] - total_pairs = sum( - 1 for _ in combinations(contigs, 2) - ) + total_pairs = sum(1 for _ in combinations(contigs, 2)) with tqdm(total=total_pairs, desc="Finding overlaps", disable=disable_tqdm) as pbar: for a, b in combinations( contigs, 2 @@ -284,7 +280,6 @@ def scaffold_iterative_dbg(contigs, min_overlap, size_threshold, disable_tqdm=Fa return sorted(current, key=len, reverse=True) - def get_kmers(sequences: Iterable[str], kmer_size: int) -> List[str]: """Generate all k-mers from a list of sequences.""" kmers = [] @@ -294,7 +289,7 @@ def get_kmers(sequences: Iterable[str], kmer_size: int) -> List[str]: L = len(seq) if L < kmer_size: continue - kmers.extend(seq[i:i+kmer_size] for i in range(L - kmer_size + 1)) + kmers.extend(seq[i : i + kmer_size] for i in range(L - kmer_size + 1)) return kmers @@ -325,7 +320,9 @@ def build_dbg_from_kmers(kmers: Iterable[str]) -> nx.DiGraph: def filter_low_weight_edges(G: nx.DiGraph, min_weight: int = 2) -> nx.DiGraph: """Remove edges with weight < min_weight (light error correction).""" - to_remove = [(u, v) for u, v, d in G.edges(data=True) if d.get("weight", 0) < min_weight] + to_remove = [ + (u, v) for u, v, d in G.edges(data=True) if d.get("weight", 0) < min_weight + ] G.remove_edges_from(to_remove) # drop isolated nodes iso = [n for n in G.nodes if G.in_degree(n) == 0 and G.out_degree(n) == 0] @@ -335,9 +332,10 @@ def filter_low_weight_edges(G: nx.DiGraph, min_weight: int = 2) -> nx.DiGraph: @dataclass class ContigPath: - nodes: List[str] # list of (k-1)-mer node labels in path order - seq: str # assembled sequence - weights: List[int] # edge weights along the path + nodes: List[str] # list of (k-1)-mer node labels in path order + seq: str # assembled sequence + weights: List[int] # edge weights along the path + def _extend_linear_path(G: nx.DiGraph, start: str, succ: str) -> ContigPath: """Extend from start→succ while in/out-degree == 1 (unbranched).""" @@ -405,13 +403,20 @@ class ContigScore: max_weight: int score: float -def score_contig(cp: ContigPath, alpha_len: float = 1.0, alpha_cov: float = 1.0, alpha_min: float = 0.2) -> ContigScore: + +def score_contig( + cp: ContigPath, + alpha_len: float = 1.0, + alpha_cov: float = 1.0, + alpha_min: float = 0.2, +) -> ContigScore: """ Simple reference-free score combining length and coverage: score = alpha_len * log(length) + alpha_cov * mean_weight + alpha_min * min_weight Adjust alphas to your data. You can also plug-in intensity-based terms later. 
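+
+    Worked example (illustrative numbers, not from real data): with the
+    default alphas and natural log, a contig of length 100 whose edge
+    weights have mean 3 and minimum 1 scores
+    1.0 * log(100) + 1.0 * 3 + 0.2 * 1 ≈ 7.81.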
""" import math + if cp.weights: mean_w = sum(cp.weights) / len(cp.weights) min_w = min(cp.weights) @@ -420,26 +425,37 @@ def score_contig(cp: ContigPath, alpha_len: float = 1.0, alpha_cov: float = 1.0, mean_w = min_w = max_w = 0.0 L = len(cp.seq) composite = alpha_len * math.log(max(L, 2)) + alpha_cov * mean_w + alpha_min * min_w - return ContigScore(seq=cp.seq, length=L, mean_weight=mean_w, min_weight=min_w, max_weight=max_w, score=composite) + return ContigScore( + seq=cp.seq, + length=L, + mean_weight=mean_w, + min_weight=min_w, + max_weight=max_w, + score=composite, + ) -def rank_contigs_by_score(contigs: List[ContigPath], - alpha_len: float = 1.0, - alpha_cov: float = 1.0, - alpha_min: float = 0.2) -> List[ContigScore]: +def rank_contigs_by_score( + contigs: List[ContigPath], + alpha_len: float = 1.0, + alpha_cov: float = 1.0, + alpha_min: float = 0.2, +) -> List[ContigScore]: scored = [score_contig(c, alpha_len, alpha_cov, alpha_min) for c in contigs] return sorted(scored, key=lambda s: (s.score, s.length), reverse=True) -def scaffold_iterative_dbgx(seqs: List[str], - kmer_size: int, - size_threshold: int = 10, - min_weight: int = 2, - max_rounds: int = 5, - patience: int = 2, - alpha_len: float = 1.0, - alpha_cov: float = 1.0, - alpha_min: float = 0.2) -> List[str]: +def scaffold_iterative_dbgx( + seqs: List[str], + kmer_size: int, + size_threshold: int = 10, + min_weight: int = 2, + max_rounds: int = 5, + patience: int = 2, + alpha_len: float = 1.0, + alpha_cov: float = 1.0, + alpha_min: float = 0.2, +) -> List[str]: """ Optional refinement: rebuild DBG from current contigs → collapse → filter by size → repeat @@ -463,7 +479,9 @@ def scaffold_iterative_dbgx(seqs: List[str], seqs_new = [r.seq for r in ranked] # improvement heuristic: fewer contigs or longer top contig - improved = (len(seqs_new) < len(best)) or (seqs_new and best and len(seqs_new[0]) > len(best[0])) + improved = (len(seqs_new) < len(best)) or ( + seqs_new and best and len(seqs_new[0]) > len(best[0]) + ) if improved: best = seqs_new no_improve = 0 @@ -494,7 +512,7 @@ def extend_path_dbg(G, contig, k, min_weight=1): # Extension forwards while extended: extended = False - suffix = seq[-(k-1):] + suffix = seq[-(k - 1) :] if suffix not in G or G.out_degree(suffix) == 0: break @@ -522,7 +540,7 @@ def extend_path_dbg(G, contig, k, min_weight=1): extended = True while extended: extended = False - prefix = seq[:k-1] + prefix = seq[: k - 1] if prefix not in G or G.in_degree(prefix) == 0: break @@ -551,22 +569,26 @@ def extend_path_dbg(G, contig, k, min_weight=1): class Assembler: """Unified assembler supporting 'greedy', 'dbg' and 'dbgx' (NetworkX) modes.""" - def __init__(self, - mode: str = "greedy", - min_overlap: int = 4, - size_threshold: int = 10, - kmer_size: int = 6, - min_identity: float = 0.8, - max_mismatches: int = 10, - # dbgx-specific: - min_weight: int = 2, - refine_rounds: int = 0, # 0 = no refine, >0 enables iterative refine - refine_patience: int = 2, - alpha_len: float = 1.0, - alpha_cov: float = 1.0, - alpha_min: float = 0.2): + def __init__( + self, + mode: str = "greedy", + min_overlap: int = 4, + size_threshold: int = 10, + kmer_size: int = 6, + min_identity: float = 0.8, + max_mismatches: int = 10, + # dbgx-specific: + min_weight: int = 2, + refine_rounds: int = 0, # 0 = no refine, >0 enables iterative refine + refine_patience: int = 2, + alpha_len: float = 1.0, + alpha_cov: float = 1.0, + alpha_min: float = 0.2, + ): if mode not in ["greedy", "dbg", "dbg_weighted", "dbgX", "fusion"]: - raise 
ValueError("mode must be 'greedy', 'dbg', 'dbg_weighted', 'dbgX' or 'fusion'") + raise ValueError( + "mode must be 'greedy', 'dbg', 'dbg_weighted', 'dbgX' or 'fusion'" + ) self.mode = mode self.min_overlap = min_overlap @@ -585,12 +607,16 @@ def __init__(self, def assemble_greedy(self, sequences): - logger.info(f"[Assembler] Running Greedy assembly (min_overlap={self.min_overlap})") + logger.info( + f"[Assembler] Running Greedy assembly (min_overlap={self.min_overlap})" + ) contigs = assemble_contigs_greedy(sequences, self.min_overlap) contigs = list(set(contigs)) contigs = sorted(contigs, key=len, reverse=True) - scaffolds = scaffold_iterative_greedy(contigs, self.min_overlap, self.size_threshold) + scaffolds = scaffold_iterative_greedy( + contigs, self.min_overlap, self.size_threshold + ) return scaffolds @@ -604,13 +630,16 @@ def assemble_dbg(self, sequences): contigs = sorted(contigs, key=len, reverse=True) contigs = [seq for seq in contigs if len(seq) > self.size_threshold] - scaffolds = scaffold_iterative_dbg(contigs, self.min_overlap, self.size_threshold) + scaffolds = scaffold_iterative_dbg( + contigs, self.min_overlap, self.size_threshold + ) return scaffolds - def assemble_dbg_weighted(self, sequences: List[str]) -> List[str]: - logger.info(f"[Assembler] Running DBG weighted (k={self.kmer_size}, min_weight={self.min_weight}, refine_rounds={self.refine_rounds})") + logger.info( + f"[Assembler] Running DBG weighted (k={self.kmer_size}, min_weight={self.min_weight}, refine_rounds={self.refine_rounds})" + ) kmers = get_kmers(sequences, self.kmer_size) if not kmers: @@ -625,7 +654,9 @@ def assemble_dbg_weighted(self, sequences: List[str]) -> List[str]: logger.warning("No contigs assembled from DBGX; returning empty result.") return [] - ranked = rank_contigs_by_score(contigs_cp, self.alpha_len, self.alpha_cov, self.alpha_min) + ranked = rank_contigs_by_score( + contigs_cp, self.alpha_len, self.alpha_cov, self.alpha_min + ) contigs = [r.seq for r in ranked] if self.refine_rounds and self.refine_rounds > 0: @@ -644,7 +675,7 @@ def assemble_dbg_weighted(self, sequences: List[str]) -> List[str]: scaffolds = list(contigs) return scaffolds - + def assemble_dbgX(self, sequences): logger.info(f"[Assembler] Running DBG-Extension (k={self.kmer_size})") @@ -656,13 +687,15 @@ def assemble_dbgX(self, sequences): contigs = [c.seq for c in contigs_cp] logger.info("Extending contigs using DBG paths (coverage-aware)...") - extended_contigs = [extend_path_dbg(G, c, self.kmer_size, self.min_weight) for c in contigs] + extended_contigs = [ + extend_path_dbg(G, c, self.kmer_size, self.min_weight) for c in contigs + ] extended_contigs = sorted(set(extended_contigs), key=len, reverse=True) return extended_contigs - + def assemble_fusion(self, sequences): - logger.info(f"[Assembler] Running FUSION (DBG weighted + greedy merge)") + logger.info("[Assembler] Running FUSION (DBG weighted + greedy merge)") contigs_dbg_weighted = self.assemble_dbg_weighted(sequences) @@ -681,12 +714,11 @@ def assemble_fusion(self, sequences): return fused - def run(self, sequences: List[str]): if not sequences: logger.error("No valid sequences provided for assembly.") raise ValueError("Input sequences list is empty.") - + if self.mode == "greedy": return self.assemble_greedy(sequences) elif self.mode == "dbg": @@ -708,29 +740,33 @@ def main( min_overlap: int, size_threshold: int, reference: bool, - chain: str, + chain: str, min_identity: float, max_mismatches: int, ): """Main function for standalone assembly.""" - protein_norm = 
None # None means no reference mode + protein_norm = None # None means no reference mode if reference: - logger.info(f"Reference mode enabled. Loading reference protein...") + logger.info("Reference mode enabled. Loading reference protein...") if not metadata_json_path: - raise ValueError("metadata_json_path is required when reference mode is enabled.") - + raise ValueError( + "metadata_json_path is required when reference mode is enabled." + ) + try: - run_name = Path(input_csv_path).stem # extract run name from input file - meta = helpers.get_sample_metadata(run=run_name, chain=chain, json_path=metadata_json_path) - protein= meta["protein"] + run_name = Path(input_csv_path).stem # extract run name from input file + meta = helpers.get_sample_metadata( + run=run_name, chain=chain, json_path=metadata_json_path + ) + protein = meta["protein"] protein_norm = helpers.get_normalized_protein(protein) logger.info("Reference protein loaded and normalized successfully.") - + except Exception as e: logger.error(f"Failed to get reference protein: {e}") logger.warning("Disabling reference mode.") - reference = False # if fails, disable reference mode + reference = False # if fails, disable reference mode input_data = Path(input_csv_path) @@ -750,10 +786,10 @@ def main( size_threshold=size_threshold, kmer_size=kmer_size, min_identity=min_identity, - max_mismatches=max_mismatches + max_mismatches=max_mismatches, ) - scaffolds = assembler.run(sequences = sequences) + scaffolds = assembler.run(sequences=sequences) output_path = Path(output_scaffolds_path) output_folder = output_path.parent.mkdir(parents=True, exist_ok=True) @@ -771,7 +807,9 @@ def main( "fasta", ) - logger.info(f"Assembly completed — {len(scaffolds)} scaffolds saved to {output_path}") + logger.info( + f"Assembly completed — {len(scaffolds)} scaffolds saved to {output_path}" + ) if protein_norm: logger.info("Reference mode: calculating statistics...") @@ -782,7 +820,7 @@ def main( assembled_contigs=scaffolds, target_protein=protein_norm, max_mismatches=max_mismatches, - min_identity=min_identity, + min_identity=min_identity, ) df_scaffolds_mapped = viz.create_dataframe_from_mapped_sequences( data=mapped_scaffolds @@ -817,7 +855,7 @@ def cli(): parser.add_argument( "--metadata-json-path", type=str, - default=None, # Optional, but required by --reference + default=None, # Optional, but required by --reference help="Path to sample_metadata.json (required for --reference).", ) parser.add_argument( @@ -855,19 +893,19 @@ def cli(): "--chain", type=str, default="", - help="Specify chain type (light/heavy) required for reference lookup." + help="Specify chain type (light/heavy) required for reference lookup.", ) parser.add_argument( "--min-identity", type=float, default=0.8, - help="Minimum identity for reference mapping." + help="Minimum identity for reference mapping.", ) parser.add_argument( "--max-mismatches", type=int, default=10, - help="Maximum mismatches for reference mapping." 
+ help="Maximum mismatches for reference mapping.", ) args = parser.parse_args() @@ -876,7 +914,7 @@ def cli(): if args.assembly_mode != "dbg": args.kmer_size = 0 logger.info("Ignoring kmer_size (used only for DBG mode).") - + if args.reference and not args.metadata_json_path: parser.error("--metadata-json-path is required when --reference is enabled.") diff --git a/src/instanexus/clustering.py b/src/instanexus/clustering.py index c7d0633..f5bf631 100755 --- a/src/instanexus/clustering.py +++ b/src/instanexus/clustering.py @@ -3,14 +3,14 @@ r"""Clustering module for assembled scaffolds. ██████████ ███████████ █████ █████ -░░███░░░░███ ░█░░░███░░░█░░███ ░░███ - ░███ ░░███░ ░███ ░ ░███ ░███ - ░███ ░███ ░███ ░███ ░███ - ░███ ░███ ░███ ░███ ░███ - ░███ ███ ░███ ░███ ░███ - ██████████ █████ ░░████████ -░░░░░░░D░░ ░░░░░ ░░░░░░░░ - +░░███░░░░███ ░█░░░███░░░█░░███ ░░███ + ░███ ░░███░ ░███ ░ ░███ ░███ + ░███ ░███ ░███ ░███ ░███ + ░███ ░███ ░███ ░███ ░███ + ░███ ███ ░███ ░███ ░███ + ██████████ █████ ░░████████ +░░░░░░░D░░ ░░░░░ ░░░░░░░░ + __authors__ = Marco Reverenna & Konstantinos Kalogeropoulus __copyright__ = Copyright 2025-2026 __research-group__ = DTU Biosustain (Multi-omics Network Analytics) and DTU Bioengineering @@ -24,25 +24,28 @@ import logging import shutil import subprocess -import pandas as pd from pathlib import Path from tempfile import mkdtemp + +import pandas as pd from Bio import SeqIO from tqdm import tqdm -logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s") +logging.basicConfig( + level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s" +) logger = logging.getLogger(__name__) def cluster_fasta_files( - scaffolds_folder: Path, - clustering_dir: Path, - min_seq_id: float = 0.85, - coverage: float = 0.8 + scaffolds_folder: Path, + clustering_dir: Path, + min_seq_id: float = 0.85, + coverage: float = 0.8, ): """ Runs mmseqs easy-cluster on the scaffolds.fasta file. - + This is based on the user's provided function, but modified to be more robust by targeting only 'scaffolds.fasta' instead of looping. """ @@ -51,7 +54,7 @@ def cluster_fasta_files( cluster_fasta_dir.mkdir(exist_ok=True) temp_dir = mkdtemp(prefix="mmseqs-") - + fasta_path = scaffolds_folder / "scaffolds.fasta" if not fasta_path.is_file(): logger.error(f"Input file not found, skipping clustering: {fasta_path}") @@ -124,22 +127,26 @@ def process_fasta_and_clusters(fasta_file: Path, scaffolds_folder: Path): return try: - cluster_df = pd.read_csv(cluster_tsv, sep="\t", header=None, names=["cluster", "contig"]) + cluster_df = pd.read_csv( + cluster_tsv, sep="\t", header=None, names=["cluster", "contig"] + ) except pd.errors.EmptyDataError: - logger.warning(f"Cluster TSV file is empty: {cluster_tsv}. No clusters to process.") + logger.warning( + f"Cluster TSV file is empty: {cluster_tsv}. No clusters to process." 
+ ) return - + records = list(SeqIO.parse(fasta_file, "fasta")) - records_dict = SeqIO.to_dict(records) # Faster lookup + records_dict = SeqIO.to_dict(records) # Faster lookup clusters = cluster_df["cluster"].unique() logger.info(f"Splitting {len(records)} sequences into {len(clusters)} clusters.") - - width = max(4, len(str(len(clusters)))) # For zfill padding + + width = max(4, len(str(len(clusters)))) # For zfill padding for i, cluster_id in enumerate(tqdm(clusters, desc="Processing clusters")): contig_ids = cluster_df[cluster_df["cluster"] == cluster_id]["contig"].values - + contig_records = [] for contig_id in contig_ids: if contig_id in records_dict: @@ -153,16 +160,12 @@ def process_fasta_and_clusters(fasta_file: Path, scaffolds_folder: Path): logger.info(f"All cluster FASTA files created in: {cluster_fasta_dir}") -def main( - input_scaffolds_folder: str, - min_seq_id: float, - coverage: float -): +def main(input_scaffolds_folder: str, min_seq_id: float, coverage: float): """ Main function to run the clustering script. """ logger.info("--- Starting Step 3: Clustering ---") - + scaffolds_folder_path = Path(input_scaffolds_folder) clustering_out_path = scaffolds_folder_path / "clustering" fasta_input_path = scaffolds_folder_path / "scaffolds.fasta" @@ -172,25 +175,27 @@ def main( if not scaffolds_folder_path.exists(): logger.error(f"Scaffolds folder does not exist: {scaffolds_folder_path}") - raise FileNotFoundError(f"Scaffolds folder does not exist: {scaffolds_folder_path}") - + raise FileNotFoundError( + f"Scaffolds folder does not exist: {scaffolds_folder_path}" + ) + logger.info("Running cluster_fasta_files...") cluster_fasta_files( scaffolds_folder=scaffolds_folder_path, clustering_dir=clustering_out_path, min_seq_id=min_seq_id, - coverage=coverage + coverage=coverage, ) - + # --- Step 2: Process the cluster results (split FASTA) --- logger.info("Running process_fasta_and_clusters...") process_fasta_and_clusters( - fasta_file=fasta_input_path, - scaffolds_folder=scaffolds_folder_path + fasta_file=fasta_input_path, scaffolds_folder=scaffolds_folder_path ) - + logger.info("--- Step 3: Clustering Completed ---") + def cli(): """ Command-line interface (CLI) for the clustering script. @@ -198,35 +203,36 @@ def cli(): parser = argparse.ArgumentParser( description="Clustering script for assembled scaffolds." ) - + parser.add_argument( "--input-scaffolds-folder", type=str, required=True, - help="Path to the folder containing 'scaffolds.fasta' (e.g., 'outputs/bsa/scaffolds')." + help="Path to the folder containing 'scaffolds.fasta' (e.g., 'outputs/bsa/scaffolds').", ) parser.add_argument( "--min-seq-id", type=float, default=0.85, - help="Minimum sequence identity for mmseqs (default: 0.85)." + help="Minimum sequence identity for mmseqs (default: 0.85).", ) parser.add_argument( "--coverage", type=float, default=0.8, - help="Coverage parameter (-c) for mmseqs (default: 0.8)." 
+ help="Coverage parameter (-c) for mmseqs (default: 0.8).", ) - + args = parser.parse_args() - + main( input_scaffolds_folder=args.input_scaffolds_folder, min_seq_id=args.min_seq_id, - coverage=args.coverage + coverage=args.coverage, ) + if __name__ == "__main__": cli() -# python -m instanexus/clustering --input-scaffolds-folder outputs/bsa/scaffolds --min-seq-id 0.85 --coverage 0.8 \ No newline at end of file +# python -m instanexus/clustering --input-scaffolds-folder outputs/bsa/scaffolds --min-seq-id 0.85 --coverage 0.8 diff --git a/src/instanexus/consensus.py b/src/instanexus/consensus.py index d8cfa14..5b81e32 100755 --- a/src/instanexus/consensus.py +++ b/src/instanexus/consensus.py @@ -3,14 +3,14 @@ r"""Consensus generation module for aligned scaffolds. ██████████ ███████████ █████ █████ -░░███░░░░███ ░█░░░███░░░█░░███ ░░███ - ░███ ░░███░ ░███ ░ ░███ ░███ - ░███ ░███ ░███ ░███ ░███ - ░███ ░███ ░███ ░███ ░███ - ░███ ███ ░███ ░███ ░███ - ██████████ █████ ░░████████ -░░░░░░░░░░ ░░░░░ ░░░░░░░░ - +░░███░░░░███ ░█░░░███░░░█░░███ ░░███ + ░███ ░░███░ ░███ ░ ░███ ░███ + ░███ ░███ ░███ ░███ ░███ + ░███ ░███ ░███ ░███ ░███ + ░███ ███ ░███ ░███ ░███ + ██████████ █████ ░░████████ +░░░░░░░░░░ ░░░░░ ░░░░░░░░ + __authors__ = Marco Reverenna __copyright__ = Copyright 2025-2026 __research-group__ = DTU Biosustain (Multi-omics Network Analytics) and DTU Bioengineering @@ -21,28 +21,28 @@ """ import argparse +import json import logging import os -import json import re import statistics -from tqdm import tqdm - -from pathlib import Path from collections import Counter -from Bio import SeqIO -from Bio.Seq import Seq -from Bio.SeqRecord import SeqRecord +from pathlib import Path -import Bio.SeqRecord -import pandas as pd +import Bio.SeqRecord +import logomaker import matplotlib.pyplot as plt +import pandas as pd import seaborn as sns -import logomaker +from Bio import SeqIO +from tqdm import tqdm -logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s") +logging.basicConfig( + level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s" +) logger = logging.getLogger(__name__) + def generate_pssm(aligned_records): """Generates a Position-Specific Scoring Matrix (PSSM) from aligned sequences.""" pssm = {} @@ -52,17 +52,17 @@ def generate_pssm(aligned_records): pssm[i] = Counter() if aa != "-": # Ignore gaps pssm[i][aa] += 1 - + # Normalize to frequencies for i in pssm: total = sum(pssm[i].values()) if total > 0: for aa in pssm[i]: pssm[i][aa] /= total - + pssm_df = pd.DataFrame(pssm).fillna(0).T pssm_df.index = pssm_df.index + 1 # 1-based indexing for positions - pssm_df = pssm_df.sort_index(axis=1) # Sort columns alphabetically (A, C, D...) + pssm_df = pssm_df.sort_index(axis=1) # Sort columns alphabetically (A, C, D...) 
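+
+    # Toy illustration (hypothetical input, not pipeline data): for two
+    # aligned sequences "AC" and "AG", the resulting frame has
+    # row 1 -> {A: 1.0} and row 2 -> {C: 0.5, G: 0.5}.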
return pssm_df @@ -80,33 +80,33 @@ def generate_consensus(pssm_df, threshold=0.6): def plot_heatmap2(pssm_df, output_file): """Generates and saves a Seaborn heatmap from a PSSM.""" df_t = pssm_df.T - + max_fig_width = 50 - fig_width = min(len(pssm_df) / 1.5, max_fig_width) - fig_height = max(5, len(df_t) / 4) + fig_width = min(len(pssm_df) / 1.5, max_fig_width) + fig_height = max(5, len(df_t) / 4) fig, ax = plt.subplots(figsize=(fig_width, fig_height)) - + sns.heatmap( df_t, ax=ax, cmap="Reds", linewidths=0.1, - linecolor='lightgrey', - cbar_kws={'label': 'Frequency'} + linecolor="lightgrey", + cbar_kws={"label": "Frequency"}, ) - + tick_positions = list(range(0, df_t.shape[1], 5)) - tick_labels = [str(t+1) for t in tick_positions] - ax.set_xticks([t + 0.5 for t in tick_positions]) + tick_labels = [str(t + 1) for t in tick_positions] + ax.set_xticks([t + 0.5 for t in tick_positions]) ax.set_xticklabels(tick_labels, rotation=0) ax.set_xlabel("Position", fontsize=14) ax.set_ylabel("Amino Acid", fontsize=14) - + plt.tight_layout() plt.savefig(output_file, format="svg", dpi=150) - plt.close(fig) + plt.close(fig) def plot_logo2(pssm_df, output_file): @@ -114,7 +114,7 @@ def plot_logo2(pssm_df, output_file): max_fig_width = 50 fig_width = min(len(pssm_df) / 1.5, max_fig_width) fig, ax = plt.subplots(figsize=[fig_width, 3]) - + logo = logomaker.Logo( pssm_df, ax=ax, @@ -131,7 +131,7 @@ def plot_logo2(pssm_df, output_file): width=0.85, baseline_width=0.5, ) - + logo.style_xticks(anchor=0, rotation=0, spacing=1, fontsize=16, ha="center") ax.set_yticks([0, 0.5, 1]) ax.set_yticklabels([0, 0.5, 1], fontsize=16) @@ -165,15 +165,15 @@ def run_consensus_generation(align_folder: str, output_folder: str, run_id: str logo_dir.mkdir(exist_ok=True) alignment_files = [f for f in sorted(os.listdir(align_path)) if f.endswith(".afa")] - + logger.info(f"Found {len(alignment_files)} aligned .afa files.") - + for alignment_file in tqdm( alignment_files, desc=f"[{run_id}] Generating Consensus" if run_id else "Generating Consensus", ): alignment_path = align_path / alignment_file - base_filename = alignment_path.stem # e.g., 'scaffold_0001_aligned' + base_filename = alignment_path.stem # e.g., 'scaffold_0001_aligned' aligned_records = list(SeqIO.parse(alignment_path, "fasta")) if not aligned_records: @@ -206,7 +206,7 @@ def generate_consensus_stats(consensus_base_folder): """Calculates statistics on the generated consensus FASTA files.""" consensus_base_folder = Path(consensus_base_folder) fasta_files = list(consensus_base_folder.glob("*_consensus.fasta")) - + if not fasta_files: logger.warning("No consensus FASTA files found, skipping stats.") return @@ -214,26 +214,28 @@ def generate_consensus_stats(consensus_base_folder): lengths = [] gap_lengths_all = [] sequences_without_gaps = 0 - + for fasta_path in fasta_files: record = next(SeqIO.parse(fasta_path, "fasta")) seq = str(record.seq) seq_len = len(seq) lengths.append(seq_len) - + if "-" not in seq: sequences_without_gaps += 1 else: gap_lengths = [len(g.group()) for g in re.finditer(r"-+", seq)] gap_lengths_all.extend(gap_lengths) - + longest_gap = max(gap_lengths_all) if gap_lengths_all else 0 shortest_gap = min(gap_lengths_all) if gap_lengths_all else 0 - percent_no_gaps = (sequences_without_gaps / len(fasta_files) * 100) if fasta_files else 0 + percent_no_gaps = ( + (sequences_without_gaps / len(fasta_files) * 100) if fasta_files else 0 + ) max_length = max(lengths) if lengths else 0 min_length = min(lengths) if lengths else 0 avg_length = 
statistics.mean(lengths) if lengths else 0 - + stats = { "n_consensus_files": len(fasta_files), "longest_gap": longest_gap, @@ -243,18 +245,14 @@ def generate_consensus_stats(consensus_base_folder): "min_consensus_length": min_length, "avg_consensus_length": round(avg_length, 2), } - + stats_path = consensus_base_folder.parent / "consensus_stats.json" with open(stats_path, "w") as f: json.dump(stats, f, indent=4) logger.info(f"Consensus statistics saved to: {stats_path}") -def main( - input_alignment_folder: str, - output_consensus_folder: str, - run_id: str = "" -): +def main(input_alignment_folder: str, output_consensus_folder: str, run_id: str = ""): """ Main function to run the consensus generation script. """ @@ -271,16 +269,14 @@ def main( consensus_fasta_dir = run_consensus_generation( align_folder=str(align_folder_in), output_folder=str(consensus_folder_out), - run_id=run_id + run_id=run_id, ) - + # --- Step 2: Generate statistics on the consensus files --- if consensus_fasta_dir: logger.info("Running consensus statistics generation...") - generate_consensus_stats( - consensus_base_folder=consensus_fasta_dir - ) - + generate_consensus_stats(consensus_base_folder=consensus_fasta_dir) + logger.info("--- Step 5: Consensus Generation Completed ---") @@ -291,37 +287,38 @@ def cli(): parser = argparse.ArgumentParser( description="Consensus generation script for aligned scaffolds." ) - + parser.add_argument( "--input-folder", type=str, required=True, - help="Path to the folder containing aligned .afa files (e.g., .../alignment)." + help="Path to the folder containing aligned .afa files (e.g., .../alignment).", ) parser.add_argument( "--output-folder", type=str, required=True, - help="Path to the folder to save consensus outputs (e.g., .../consensus)." + help="Path to the folder to save consensus outputs (e.g., .../consensus).", ) parser.add_argument( "--run-id", type=str, default="", - help="Optional ID to display in the progress bar." 
+ help="Optional ID to display in the progress bar.", ) - + args = parser.parse_args() - + main( input_alignment_folder=args.input_folder, output_consensus_folder=args.output_folder, - run_id=args.run_id + run_id=args.run_id, ) + if __name__ == "__main__": cli() # python -m instanexus.consensus \ # --input-folder outputs/bsa/scaffolds/alignment \ -# --output-folder outputs/bsa/scaffolds/consensus \ No newline at end of file +# --output-folder outputs/bsa/scaffolds/consensus diff --git a/src/instanexus/helpers.py b/src/instanexus/helpers.py index 747f443..6bd6a5f 100644 --- a/src/instanexus/helpers.py +++ b/src/instanexus/helpers.py @@ -3,14 +3,14 @@ r""" ██████████ ███████████ █████ █████ -░░███░░░░███ ░█░░░███░░░█░░███ ░░███ - ░███ ░░███░ ░███ ░ ░███ ░███ - ░███ ░███ ░███ ░███ ░███ - ░███ ░███ ░███ ░███ ░███ - ░███ ███ ░███ ░███ ░███ - ██████████ █████ ░░████████ -░░░░░░░░░░ ░░░░░ ░░░░░░░░ - +░░███░░░░███ ░█░░░███░░░█░░███ ░░███ + ░███ ░░███░ ░███ ░ ░███ ░███ + ░███ ░███ ░███ ░███ ░███ + ░███ ░███ ░███ ░███ ░███ + ░███ ███ ░███ ░███ ░███ + ██████████ █████ ░░████████ +░░░░░░░░░░ ░░░░░ ░░░░░░░░ + __authors__ = Marco Reverenna __copyright__ = Copyright 2025-2026 __research-group__ = DTU Biosustain (Multi-omics Network Analytics) and DTU Bioengineering @@ -20,15 +20,11 @@ __status__ = Dev """ -# import libraries -import numpy as np -import pandas as pd -import os import json -import plotly.express as px -import plotly.graph_objects as go +import os from pathlib import Path +# import libraries PROJECT_ROOT = Path(__file__).resolve().parents[2] JSON_DIR = PROJECT_ROOT / "json" @@ -163,4 +159,4 @@ def compute_assembly_statistics(df, sequence_type, output_folder, reference, **p with open(output_path, "w") as file: json.dump(statistics, file, indent=4) - return statistics \ No newline at end of file + return statistics diff --git a/src/instanexus/main.py b/src/instanexus/main.py index 72d7fc5..8720ed4 100644 --- a/src/instanexus/main.py +++ b/src/instanexus/main.py @@ -3,14 +3,14 @@ r"""Pipeline script for InstaNexus. ██████████ ███████████ █████ █████ -░░███░░░░███ ░█░░░███░░░█░░███ ░░███ - ░███ ░░███░ ░███ ░ ░███ ░███ - ░███ ░███ ░███ ░███ ░███ - ░███ ░███ ░███ ░███ ░███ - ░███ ███ ░███ ░███ ░███ - ██████████ █████ ░░████████ -░░░░░░░░░░ ░░░░░ ░░░░░░░░ - +░░███░░░░███ ░█░░░███░░░█░░███ ░░███ + ░███ ░░███░ ░███ ░ ░███ ░███ + ░███ ░███ ░███ ░███ ░███ + ░███ ░███ ░███ ░███ ░███ + ░███ ███ ░███ ░███ ░███ + ██████████ █████ ░░████████ +░░░░░░░░░░ ░░░░░ ░░░░░░░░ + __authors__ = Marco Reverenna __copyright__ = Copyright 2025-2026 __research-group__ = DTU Biosustain (Multi-omics Network Analytics) and DTU Bioengineering @@ -23,14 +23,13 @@ import argparse import logging from pathlib import Path -from . import preprocessing -from . import assembly -from . import clustering -from . import alignment -from . import consensus + +from . 

 # Setup logging
-logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s"
+)
 logging.getLogger("instanexus.preprocessing").setLevel(logging.WARNING)
 logging.getLogger("instanexus.assembly").setLevel(logging.WARNING)
 logging.getLogger("instanexus.clustering").setLevel(logging.WARNING)
@@ -38,13 +37,14 @@
 logging.getLogger("instanexus.consensus").setLevel(logging.WARNING)

 logger = logging.getLogger(__name__)

+
 def cli():
     """Command-line interface for the master pipeline."""
-
+
     parser = argparse.ArgumentParser(
         description="Run the full InstaNexus preprocessing and assembly pipeline."
     )
-
+
     # --- INPUT ARGUMENTS ---
     parser.add_argument(
         "--input-csv",
@@ -62,13 +62,13 @@ def cli():
         "--metadata-json-path",
         type=str,
         required=True,
-        help="Path to the sample_metadata.json file (required by preprocessing and assembly)."
+        help="Path to the sample_metadata.json file (required by preprocessing and assembly).",
     )
     parser.add_argument(
         "--contaminants-fasta-path",
         type=str,
         required=True,
-        help="Path to the contaminants.fasta file (required by preprocessing)."
+        help="Path to the contaminants.fasta file (required by preprocessing).",
     )
     parser.add_argument(
         "--chain",
@@ -84,7 +84,7 @@ def cli():
     parser.add_argument(
         "--conf",
         type=float,
-        default=None, # Default to None for preprocessing logic
+        default=None,  # Default to None for preprocessing logic
         help="Confidence threshold for filtering (optional).",
     )
     parser.add_argument(
@@ -128,17 +128,17 @@ def cli():
         "--min-seq-id",
         type=float,
         default=0.85,
-        help="Minimum sequence identity for mmseqs (default: 0.85)."
+        help="Minimum sequence identity for mmseqs (default: 0.85).",
     )
     parser.add_argument(
         "--coverage",
         type=float,
         default=0.8,
-        help="Coverage parameter (-c) for mmseqs (default: 0.8)."
+        help="Coverage parameter (-c) for mmseqs (default: 0.8).",
     )
-
+
     args = parser.parse_args()
-
+
     # Run the pipeline with the validated arguments
     run_pipeline(args)

@@ -147,21 +147,21 @@ def run_pipeline(args):
     """
     Orchestrates the full pipeline, calling each refactored module.
""" - + logger.info("--- InstaNexus Pipeline started ---") - - run_name = Path(args.input_csv).stem # e.g., 'bsa' - base_output_folder = Path(args.folder_outputs) / run_name # e.g., 'outputs/bsa' - + + run_name = Path(args.input_csv).stem # e.g., 'bsa' + base_output_folder = Path(args.folder_outputs) / run_name # e.g., 'outputs/bsa' + # Build the experiment folder name based on parameters folder_name_parts = [f"{args.assembly_mode}"] - + if args.conf is not None: folder_name_parts.append(f"c{args.conf}") - + if "dbg" in args.assembly_mode: folder_name_parts.append(f"ks{args.kmer_size}") - + folder_name_parts.append(f"mo{args.min_overlap}") folder_name_parts.append(f"ts{args.size_threshold}") @@ -169,29 +169,30 @@ def run_pipeline(args): folder_name_parts.extend([f"mi{args.min_identity}", f"mm{args.max_mismatches}"]) run_folder_name = "_".join(folder_name_parts) - experiment_folder = base_output_folder / run_folder_name # e.g., 'outputs/bsa/greedy_c0.9_mo4_ts10' - + experiment_folder = ( + base_output_folder / run_folder_name + ) # e.g., 'outputs/bsa/greedy_c0.9_mo4_ts10' + cleaned_csv_path = experiment_folder / "cleaned.csv" - + scaffolds_folder = experiment_folder / "scaffolds" scaffolds_fasta_path = scaffolds_folder / "scaffolds.fasta" - - clustering_folder = scaffolds_folder / "clustering" # Clustering output - cluster_fasta_folder = clustering_folder / "cluster_fasta" # Input for alignment - + + clustering_folder = scaffolds_folder / "clustering" # Clustering output + cluster_fasta_folder = clustering_folder / "cluster_fasta" # Input for alignment + alignment_folder = scaffolds_folder / "alignment" - + consensus_folder = scaffolds_folder / "consensus" # ID for logs (optional) run_id_str = f"[{run_name} @ {run_folder_name}]" - + logger.info(f"Starting pipeline for run: {run_id_str}") logger.info(f"All results will be saved to: {experiment_folder}") - try: - logger.info(f"--- [Step 1/5] Running Preprocessing ---") + logger.info("--- [Step 1/5] Running Preprocessing ---") preprocessing.main( input_csv=args.input_csv, metadata_json=args.metadata_json_path, @@ -199,14 +200,14 @@ def run_pipeline(args): chain=args.chain, reference=args.reference, conf=args.conf, - output_csv_path=str(cleaned_csv_path) # Pass explicit path + output_csv_path=str(cleaned_csv_path), # Pass explicit path ) except Exception as e: logger.error(f"Preprocessing failed: {e}") - return + return try: - logger.info(f"--- [Step 2/5] Running Assembly ---") + logger.info("--- [Step 2/5] Running Assembly ---") assembly.main( input_csv_path=str(cleaned_csv_path), output_scaffolds_path=str(scaffolds_fasta_path), @@ -218,45 +219,45 @@ def run_pipeline(args): reference=args.reference, chain=args.chain, min_identity=args.min_identity, - max_mismatches=args.max_mismatches + max_mismatches=args.max_mismatches, ) except Exception as e: logger.error(f"Assembly failed: {e}") return try: - logger.info(f"--- [Step 3/5] Running Clustering ---") + logger.info("--- [Step 3/5] Running Clustering ---") clustering.main( - input_scaffolds_folder=str(scaffolds_folder), # Pass the folder + input_scaffolds_folder=str(scaffolds_folder), # Pass the folder min_seq_id=args.min_seq_id, - coverage=args.coverage + coverage=args.coverage, ) except Exception as e: logger.error(f"Clustering failed: {e}") return - + try: - logger.info(f"--- [Step 4/5] Running Alignment ---") + logger.info("--- [Step 4/5] Running Alignment ---") alignment.main( input_cluster_fasta_folder=str(cluster_fasta_folder), - output_alignment_folder=str(alignment_folder) + 
         )
     except Exception as e:
         logger.error(f"Alignment failed: {e}")
         return
-
+
     try:
-        logger.info(f"--- [Step 5/5] Running Consensus ---")
+        logger.info("--- [Step 5/5] Running Consensus ---")
         consensus.main(
             input_alignment_folder=str(alignment_folder),
             output_consensus_folder=str(consensus_folder),
-            run_id=run_id_str # Pass ID for logs
+            run_id=run_id_str,  # Pass ID for logs
         )
     except Exception as e:
         logger.error(f"Consensus failed: {e}")
         return

-    logger.info(f"--- InstaNexus Pipeline finished successfully! ---")
+    logger.info("--- InstaNexus Pipeline finished successfully! ---")
     logger.info(f"Final results in: {experiment_folder}")

@@ -274,4 +275,4 @@ def run_pipeline(args):
 # --kmer-size 7 \
 # --size-threshold 12 \
 # --min-overlap 3 \
-# --min-seq-id 0.85
\ No newline at end of file
+# --min-seq-id 0.85
diff --git a/src/instanexus/preprocessing.py b/src/instanexus/preprocessing.py
index bf50b4f..3f864d0 100755
--- a/src/instanexus/preprocessing.py
+++ b/src/instanexus/preprocessing.py
@@ -1,16 +1,16 @@
 #!/usr/bin/env python

-r""" Preprocessing module for InstaNexus.
+r"""Preprocessing module for InstaNexus.

 ██████████ ███████████ █████ █████
-░░███░░░░███ ░█░░░███░░░█░░███ ░░███
- ░███ ░░███░ ░███ ░ ░███ ░███
- ░███ ░███ ░███ ░███ ░███
- ░███ ░███ ░███ ░███ ░███
- ░███ ███ ░███ ░███ ░███
- ██████████ █████ ░░████████
-░░░░░░░░░░ ░░░░░ ░░░░░░░░
-
+░░███░░░░███ ░█░░░███░░░█░░███ ░░███
+ ░███ ░░███░ ░███ ░ ░███ ░███
+ ░███ ░███ ░███ ░███ ░███
+ ░███ ░███ ░███ ░███ ░███
+ ░███ ███ ░███ ░███ ░███
+ ██████████ █████ ░░████████
+░░░░░░░░░░ ░░░░░ ░░░░░░░░
+
 __authors__ = Marco Reverenna
 __copyright__ = Copyright 2025-2026
 __research-group__ = DTU Biosustain (Multi-omics Network Analytics) and DTU Bioengineering
@@ -20,28 +20,26 @@
 """

-# import libraries
-import os
-import re
 import json
 import logging
-import numpy as np
-import pandas as pd
-import plotly.express as px
-import plotly.graph_objects as go
+
+# import libraries
+import re
 from pathlib import Path
+
+import numpy as np
+import pandas as pd
 from Bio import SeqIO

+# PROJECT_ROOT = Path(__file__).resolve().parents[2]
+# JSON_DIR = PROJECT_ROOT / "json"

-#PROJECT_ROOT = Path(__file__).resolve().parents[2]
-#JSON_DIR = PROJECT_ROOT / "json"

 def get_sample_metadata(run, chain="", json_path=None):
     """Retrieve sample metadata from a JSON file based on the run and optional chain."""
     if json_path is None:
         raise ValueError("json_path must be provided.")
-
+
     with open(json_path, "r") as f:
         all_meta = json.load(f)

@@ -174,22 +172,21 @@ def filter_contaminants(seqs, run, contaminants_fasta):

 logger = logging.getLogger(__name__)

-
 def main(
     input_csv: str,
     metadata_json: str,
     contaminants_fasta: str,
     chain: str,
-    #folder_outputs: str,
+    # folder_outputs: str,
     reference: bool,
-    #assembly_mode: str,
+    # assembly_mode: str,
     conf: float,
     output_csv_path: str,
-    #kmer_size: int,
-    #size_threshold: int,
-    #min_overlap: int,
-    #min_identity: float,
-    #max_mismatches: int,
+    # kmer_size: int,
+    # size_threshold: int,
+    # min_overlap: int,
+    # min_identity: float,
+    # max_mismatches: int,
 ):
     """Main function to run the preprocessing script."""
     input_csv = Path(input_csv)
@@ -197,7 +194,7 @@ def main(
     print("Starting preprocessing pipeline.")

     input_csv = Path(input_csv)
-    run = input_csv.stem # stem gives the filename without suffix
+    run = input_csv.stem  # stem gives the filename without suffix

     # load metadata
     if chain:
@@ -221,15 +218,15 @@ def main(

     print("Parameters loaded.")

-    #folder_outputs = Path(folder_outputs) / run
-    #folder_outputs.mkdir(parents=True, exist_ok=True)
+    # folder_outputs = Path(folder_outputs) / run
+    # folder_outputs.mkdir(parents=True, exist_ok=True)

-    #folder_name_parts = [f"comb_{assembly_mode}", f"c{conf}", f"ts{size_threshold}", f"mo{min_overlap}"]
+    # folder_name_parts = [f"comb_{assembly_mode}", f"c{conf}", f"ts{size_threshold}", f"mo{min_overlap}"]

     # if assembly_mode == "dbg":
     #     folder_name_parts.insert(2, f"ks{kmer_size}")

-    #if reference:
+    # if reference:
     #     folder_name_parts.extend([f"mi{min_identity}", f"mm{max_mismatches}"])

     # combination_folder_out = folder_outputs / "_".join(folder_name_parts)
@@ -238,7 +235,7 @@ def main(
     # print(f"Output folders created at: {combination_folder_out}")

     logger.info("Starting data cleaning...")
-
+
     if reference:
         protein_norm = normalize_sequence(protein)

     df = pd.read_csv(input_csv)
@@ -246,16 +243,14 @@ def main(
     df["protease"] = df["experiment_name"].apply(
         lambda name: extract_protease(name, proteases)
     )
-
+
     df = clean_dataframe(df)
-
+
     df["cleaned_preds"] = df["preds"].apply(remove_modifications)
-
+
     cleaned_psms = df["cleaned_preds"].tolist()

-    filtered_psms = filter_contaminants(
-        cleaned_psms, run, contaminants_fasta
-    )
+    filtered_psms = filter_contaminants(cleaned_psms, run, contaminants_fasta)

     df = df[df["cleaned_preds"].isin(filtered_psms)]

     if reference:
@@ -269,15 +264,15 @@ def main(
         df = df[df["conf"] > conf]
     else:
         logger.info("No confidence threshold applied.")
-
+
     df.reset_index(drop=True, inplace=True)
-
+
     logger.info("Data cleaning completed.")

     cleaned_csv_path = Path(output_csv_path)
     cleaned_csv_path.parent.mkdir(parents=True, exist_ok=True)
-    #cleaned_csv_path = combination_folder_out / "cleaned" / "cleaned_data.csv"
-
+    # cleaned_csv_path = combination_folder_out / "cleaned" / "cleaned_data.csv"
+
     df.to_csv(cleaned_csv_path, index=False)
     logger.info("Cleaned data saved to: {}.".format(cleaned_csv_path))

@@ -371,14 +366,15 @@ def cli():
         "--output-csv-path",
         type=str,
         required=True,
-        help="Path to the output CSV file."
+        help="Path to the output CSV file.",
     )

     args = parser.parse_args()
     main(**vars(args))

+
 if __name__ == "__main__":
     cli()

-# python -m instanexus.preprocessing --input-csv inputs/bsa.csv --metadata-json json/sample_metadata.json --contaminants-fasta fasta/contaminants.fasta --output-csv-path outputs/bsa_cleaned.csv --conf 0.9 --reference
\ No newline at end of file
+# python -m instanexus.preprocessing --input-csv inputs/bsa.csv --metadata-json json/sample_metadata.json --contaminants-fasta fasta/contaminants.fasta --output-csv-path outputs/bsa_cleaned.csv --conf 0.9 --reference
diff --git a/src/instanexus/visualization.py b/src/instanexus/visualization.py
index 0b01eb5..44b6d6f 100644
--- a/src/instanexus/visualization.py
+++ b/src/instanexus/visualization.py
@@ -1,15 +1,15 @@
 #!/usr/bin/env python

-r""" Visualization module for InstaNexus.
+r"""Visualization module for InstaNexus.

 ██████████ ███████████ █████ █████
-░░███░░░░███ ░█░░░███░░░█░░███ ░░███
- ░███ ░░███░ ░███ ░ ░███ ░███
- ░███ ░███ ░███ ░███ ░███
- ░███ ░███ ░███ ░███ ░███
- ░███ ███ ░███ ░███ ░███
- ██████████ █████ ░░████████
-░░░░░░░░░░ ░░░░░ ░░░░░░░░
+░░███░░░░███ ░█░░░███░░░█░░███ ░░███
+ ░███ ░░███░ ░███ ░ ░███ ░███
+ ░███ ░███ ░███ ░███ ░███
+ ░███ ░███ ░███ ░███ ░███
+ ░███ ███ ░███ ░███ ░███
+ ██████████ █████ ░░████████
+░░░░░░░░░░ ░░░░░ ░░░░░░░░

 __authors__ = Marco Reverenna
 __copyright__ = Copyright 2025-2026
@@ -21,11 +21,12 @@
 """

 import os
-import numpy as np
-import pandas as pd
+
 import Bio.SeqIO
 import matplotlib.patches as patches
 import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
 import plotly.express as px
 import plotly.graph_objects as go
 import seaborn as sns
@@ -429,9 +430,7 @@ def plot_relative_map_distribution(run, df, reference, folder, title=False):
             x1 = fine_x[i + 1]
             y_segment = (fine_y[i] + fine_y[i + 1]) / 2.0
             x_mid = (x0 + x1) / 2.0
-            alpha = 1 - (x_mid - 0.88) / (
-                1 - 0.88
-            )
+            alpha = 1 - (x_mid - 0.88) / (1 - 0.88)
             fillcolor = (
                 f"rgba({base_color[0]}, {base_color[1]}, {base_color[2]}, {alpha:.2f})"
             )
@@ -571,7 +570,7 @@ def plot_confidence_distribution(df, folder_figures, min_conf=0, max_conf=1):
     fig.update_xaxes(
         showgrid=True,
         gridcolor="lightgray",
-        ticklabelposition="outside bottom", 
+        ticklabelposition="outside bottom",
         dtick=0.02,
     )
     fig.update_yaxes(showgrid=True, gridcolor="lightgray")
@@ -603,8 +602,8 @@ def plot_protease_distribution(protease_counts, folder_figures):
     fig.update_traces(textposition="outside", width=0.4)

     mm_to_px = 3.78
-    width_mm = 240 
-    height_mm = 200 
+    width_mm = 240
+    height_mm = 200

     fig.update_layout(
         width=int(width_mm * mm_to_px),
@@ -619,12 +618,13 @@ def plot_protease_distribution(protease_counts, folder_figures):
             color="black",
         ),
         margin=dict(t=50, b=50, l=50, r=100),
-        plot_bgcolor="white", 
+        plot_bgcolor="white",
         paper_bgcolor="white",
     )

     fig.write_image(f"{folder_figures}/proteases_distribution.svg")

+
 def map_to_protein(seq, protein, max_mismatches, min_identity):
     """Maps a sequence (`seq`) to a target protein sequence, allowing for mismatches,
     and identifies the best match based on the maximum mismatches and minimum identity threshold.
@@ -893,8 +893,7 @@ def mapping_sequences(mapped_sequences, prot_seq, title, output_folder, output_f

 def create_dataframe_from_mapped_sequences(data):
-    """Takes a list of tuples containing sequence data and returns a structured DataFrame.
- """ + """Takes a list of tuples containing sequence data and returns a structured DataFrame.""" # Create the initial DataFrame df = pd.DataFrame(data, columns=["Sequence", "Details"]) @@ -1185,4 +1184,4 @@ def mapping_psms_protease_associated( font=dict(size=14, family="Arial, sans-serif", color="black"), ) - fig.write_image(f"{output_folder}/{output_file}", scale=2) \ No newline at end of file + fig.write_image(f"{output_folder}/{output_file}", scale=2) diff --git a/tests/test_preprocessing.py b/tests/test_preprocessing.py index 157531c..ff64ff2 100644 --- a/tests/test_preprocessing.py +++ b/tests/test_preprocessing.py @@ -1,21 +1,17 @@ #!/usr/bin/env python3 -import pytest from instanexus.preprocessing import remove_modifications + def test_remove_modifications(): assert remove_modifications("A(ox)BC(mod)D") == "ABCD" assert remove_modifications("A[UNIMOD:21]BC[UNIMOD:35]D") == "ABCD" - assert ( - remove_modifications("A(ox)[UNIMOD:21]BC(mod)[UNIMOD:35]D") == "ABCD" - ) + assert remove_modifications("A(ox)[UNIMOD:21]BC(mod)[UNIMOD:35]D") == "ABCD" assert remove_modifications(None) is None assert remove_modifications("ACD") == "ACD" assert remove_modifications("A(I)BCD") == "ABCD" assert remove_modifications("A(ox)B(I)C(mod)D") == "ABCD" - assert ( - remove_modifications("A(ox)[UNIMOD:21]B(I)C(mod)[UNIMOD:35]D") == "ABCD" - ) + assert remove_modifications("A(ox)[UNIMOD:21]B(I)C(mod)[UNIMOD:35]D") == "ABCD" assert remove_modifications("AI BCD") == "AL BCD" - assert remove_modifications("A(ox)I B(mod)CD") == "AL BCD" \ No newline at end of file + assert remove_modifications("A(ox)I B(mod)CD") == "AL BCD"