diff --git a/docs/source/conf.py b/docs/source/conf.py index 488bea2..309f49e 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -6,23 +6,22 @@ # -- Project information ----------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information -project = 'InstaNexus' -copyright = '2025, Marco Reverenna' -author = 'Marco Reverenna' -release = '0.2.0' +project = "InstaNexus" +copyright = "2025, Marco Reverenna" +author = "Marco Reverenna" +release = "0.2.0" # -- General configuration --------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration extensions = [] -templates_path = ['_templates'] +templates_path = ["_templates"] exclude_patterns = [] - # -- Options for HTML output ------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output -html_theme = 'alabaster' -html_static_path = ['_static'] +html_theme = "alabaster" +html_static_path = ["_static"] diff --git a/docs/source/tutorials/case_studies/prot_optimization_dbg.ipynb b/docs/source/tutorials/case_studies/prot_optimization_dbg.ipynb index ecb62aa..7b539a9 100755 --- a/docs/source/tutorials/case_studies/prot_optimization_dbg.ipynb +++ b/docs/source/tutorials/case_studies/prot_optimization_dbg.ipynb @@ -519,7 +519,7 @@ " sequence_type=\"contigs\",\n", " output_folder=RESULTS_DIR,\n", " reference=protein_norm,\n", - " **params\n", + " **params,\n", " )\n", "\n", " coverage_contigs = stat_contigs.get(\"coverage\")\n", @@ -553,7 +553,7 @@ " sequence_type=\"scaffolds\",\n", " output_folder=RESULTS_DIR,\n", " reference=protein_norm,\n", - " **params\n", + " **params,\n", " )\n", "\n", " coverage_scaffolds = stat_scaffolds.get(\"coverage\")\n", @@ -814,7 +814,7 @@ " sequence_type=\"contigs\",\n", " output_folder=RESULTS_DIR,\n", " reference=protein_norm,\n", - " **params\n", + " **params,\n", " )\n", " coverage_contigs = stat_contigs.get(\"coverage\")\n", "\n", @@ -847,7 +847,7 @@ " sequence_type=\"scaffolds\",\n", " output_folder=RESULTS_DIR,\n", " reference=protein_norm,\n", - " **params\n", + " **params,\n", " )\n", "\n", " coverage_scaffolds = stat_scaffolds.get(\"coverage\")\n", diff --git a/docs/source/tutorials/case_studies/prot_optimization_greedy.ipynb b/docs/source/tutorials/case_studies/prot_optimization_greedy.ipynb index cd5d149..cfbbba7 100755 --- a/docs/source/tutorials/case_studies/prot_optimization_greedy.ipynb +++ b/docs/source/tutorials/case_studies/prot_optimization_greedy.ipynb @@ -508,7 +508,7 @@ " sequence_type=\"contigs\",\n", " output_folder=\".\",\n", " reference=protein_norm,\n", - " **params\n", + " **params,\n", " )\n", "\n", " coverage_contigs = stat_contigs.get(\"coverage\")\n", @@ -556,7 +556,7 @@ " sequence_type=\"scaffolds\",\n", " output_folder=\".\",\n", " reference=protein_norm,\n", - " **params\n", + " **params,\n", " )\n", " coverage_scaffolds = stat_scaffolds.get(\"coverage\")\n", "\n", @@ -740,7 +740,7 @@ " sequence_type=\"contigs\",\n", " output_folder=\".\",\n", " reference=protein_norm,\n", - " **params\n", + " **params,\n", " )\n", " coverage_contigs = stat_contigs.get(\"coverage\")\n", "\n", @@ -784,7 +784,7 @@ " sequence_type=\"scaffolds\",\n", " output_folder=\".\",\n", " reference=protein_norm,\n", - " **params\n", + " **params,\n", " )\n", " coverage_scaffolds = stat_scaffolds.get(\"coverage\")\n", "\n", diff --git 
a/docs/source/tutorials/examples/dbg_variants_workflow.ipynb b/docs/source/tutorials/examples/dbg_variants_workflow.ipynb index c046c11..14677fd 100644 --- a/docs/source/tutorials/examples/dbg_variants_workflow.ipynb +++ b/docs/source/tutorials/examples/dbg_variants_workflow.ipynb @@ -48,7 +48,7 @@ "outputs": [], "source": [ "# read a pre cleaned data file\n", - "#data = pd.read_csv(\"../outputs/bsa/comb_dbg_c0.9_ks7_ts12_mo3/cleaned/cleaned_data.csv\")" + "# data = pd.read_csv(\"../outputs/bsa/comb_dbg_c0.9_ks7_ts12_mo3/cleaned/cleaned_data.csv\")" ] }, { @@ -62,9 +62,9 @@ "\n", "import re\n", "\n", - "file_name = 'bsa'\n", + "file_name = \"bsa\"\n", "\n", - "data = pd.read_csv(f'../inputs/{file_name}.csv'.format(file_name=file_name))\n", + "data = pd.read_csv(f\"../inputs/{file_name}.csv\".format(file_name=file_name))\n", "\n", "data[\"log_probs\"] = data[\"log_probs\"].replace(-1, -10)\n", "\n", @@ -99,13 +99,12 @@ "metadata": {}, "outputs": [], "source": [ - "from pathlib import Path\n", "\n", "repo_folder = Path(\"../\")\n", "\n", "filtered_psms = instanexus.preprocessing.filter_contaminants(\n", - " cleaned_psms, run, repo_folder / \"fasta/contaminants.fasta\"\n", - " )\n", + " cleaned_psms, run, repo_folder / \"fasta/contaminants.fasta\"\n", + ")\n", "\n", "data = data[data[\"preds\"].isin(filtered_psms)]" ] @@ -158,10 +157,10 @@ "source": [ "assembler = Assembler(\n", " mode=\"dbg_weighted\",\n", - " kmer_size=7, \n", - " size_threshold=0, \n", - " min_weight=2, # filter low-weight edges\n", - " refine_rounds=3, # optional iterative refinement\n", + " kmer_size=7,\n", + " size_threshold=0,\n", + " min_weight=2, # filter low-weight edges\n", + " refine_rounds=3, # optional iterative refinement\n", ")" ] }, @@ -172,7 +171,9 @@ "metadata": {}, "outputs": [], "source": [ - "scaffolds_dbg_w = assembler.run(sequences, output_folder=output_folder, protein_norm=None)" + "scaffolds_dbg_w = assembler.run(\n", + " sequences, output_folder=output_folder, protein_norm=None\n", + ")" ] }, { @@ -242,7 +243,9 @@ "metadata": {}, "outputs": [], "source": [ - "mapped_contigs = map.process_protein_contigs_scaffold(scaffolds_dbg_w, protein_norm, max_mismatches = 10, min_identity = 0.8)" + "mapped_contigs = map.process_protein_contigs_scaffold(\n", + " scaffolds_dbg_w, protein_norm, max_mismatches=10, min_identity=0.8\n", + ")" ] }, { @@ -338,8 +341,8 @@ "assembler_dbgx = Assembler(\n", " mode=\"dbgX\",\n", " kmer_size=7,\n", - " size_threshold=10, \n", - " min_weight=2, \n", + " size_threshold=10,\n", + " min_weight=2,\n", ")" ] }, @@ -351,9 +354,7 @@ "outputs": [], "source": [ "scaffolds_dbgx = assembler_dbgx.run(\n", - " sequences=sequences,\n", - " output_folder=output_folder,\n", - " protein_norm=None\n", + " sequences=sequences, output_folder=output_folder, protein_norm=None\n", ")" ] }, @@ -364,7 +365,9 @@ "metadata": {}, "outputs": [], "source": [ - "mapped_scaffolds_dbgx = map.process_protein_contigs_scaffold(scaffolds_dbgx, protein_norm, max_mismatches = 10, min_identity = 0.8)" + "mapped_scaffolds_dbgx = map.process_protein_contigs_scaffold(\n", + " scaffolds_dbgx, protein_norm, max_mismatches=10, min_identity=0.8\n", + ")" ] }, { @@ -427,7 +430,7 @@ " mode=\"fusion\",\n", " kmer_size=7,\n", " size_threshold=10,\n", - " min_overlap=3, \n", + " min_overlap=3,\n", " min_weight=2,\n", ")" ] @@ -450,9 +453,7 @@ "outputs": [], "source": [ "scaffolds_fusion = assembler_fusion.run(\n", - " sequences=sequences,\n", - " output_folder=output_folder_fusion,\n", - " protein_norm=None\n", + " 
sequences=sequences, output_folder=output_folder_fusion, protein_norm=None\n", ")" ] }, @@ -463,7 +464,9 @@ "metadata": {}, "outputs": [], "source": [ - "mapped_scaffolds_fusion = map.process_protein_contigs_scaffold(scaffolds_fusion, protein_norm, max_mismatches=10, min_identity=0.8)\n", + "mapped_scaffolds_fusion = map.process_protein_contigs_scaffold(\n", + " scaffolds_fusion, protein_norm, max_mismatches=10, min_identity=0.8\n", + ")\n", "\n", "# top 20\n", "mapped_scaffolds_fusion = mapped_scaffolds_fusion[:20]" diff --git a/docs/source/tutorials/examples/hybrid_workflow_with_figures.ipynb b/docs/source/tutorials/examples/hybrid_workflow_with_figures.ipynb index de904df..afbbea0 100644 --- a/docs/source/tutorials/examples/hybrid_workflow_with_figures.ipynb +++ b/docs/source/tutorials/examples/hybrid_workflow_with_figures.ipynb @@ -39,19 +39,17 @@ "import alignment as align\n", "import clustering as clus\n", "import preprocessing as prep\n", - "import compute_statistics as comp_stat\n", - "#import model_peptide_selector as selector\n", + "\n", + "# import model_peptide_selector as selector\n", "\n", "# import libraries\n", "from pathlib import Path\n", "from Bio import SeqIO\n", "\n", - "#import joblib\n", + "# import joblib\n", "import json\n", "import Bio\n", - "import pandas as pd\n", - "import matplotlib.pyplot as plt\n", - "import seaborn as sns" + "import pandas as pd" ] }, { @@ -131,16 +129,10 @@ "metadata": {}, "outputs": [], "source": [ - "def get_combination_name(\n", - " ass_method,\n", - " conf,\n", - " kmer_size,\n", - " size_threshold,\n", - " min_overlap\n", - "):\n", + "def get_combination_name(ass_method, conf, kmer_size, size_threshold, min_overlap):\n", " if ass_method in (\"dbg\", \"hybrid\"):\n", " return f\"comb_{ass_method}_c{conf}_ks{kmer_size}_ts{size_threshold}_mo{min_overlap}\"\n", - " \n", + "\n", " elif ass_method == \"greedy\":\n", " return f\"comb_{ass_method}_c{conf}_ts{size_threshold}_mo{min_overlap}\"" ] @@ -186,12 +178,7 @@ "metadata": {}, "outputs": [], "source": [ - "comb = get_combination_name(\n", - " ass_method,\n", - " conf,\n", - " kmer_size,\n", - " size_threshold,\n", - " min_overlap)\n", + "comb = get_combination_name(ass_method, conf, kmer_size, size_threshold, min_overlap)\n", "\n", "print(comb)" ] @@ -207,7 +194,7 @@ " \"ass_method\": ass_method,\n", " \"conf\": conf,\n", " \"size_threshold\": size_threshold,\n", - " \"min_overlap\": min_overlap\n", + " \"min_overlap\": min_overlap,\n", "}" ] }, @@ -346,7 +333,9 @@ " filtered_df = df[mask].copy()\n", " removed_count = (~mask).sum()\n", "\n", - " print(f\"Removed {removed_count} contaminant sequences, {len(filtered_df)} remaining.\")\n", + " print(\n", + " f\"Removed {removed_count} contaminant sequences, {len(filtered_df)} remaining.\"\n", + " )\n", " return filtered_df" ] }, @@ -492,9 +481,9 @@ "metadata": {}, "outputs": [], "source": [ - "greedy_scaffolds = greedy.scaffold_iterative_greedy(assembled_contigs,\n", - " min_overlap,\n", - " size_threshold)" + "greedy_scaffolds = greedy.scaffold_iterative_greedy(\n", + " assembled_contigs, min_overlap, size_threshold\n", + ")" ] }, { @@ -655,10 +644,12 @@ "outputs": [], "source": [ "mapped_scaffolds = map.process_protein_contigs_scaffold(\n", - " all_scaffolds, protein_norm, max_mismatches = 0, min_identity = 0.90\n", + " all_scaffolds, protein_norm, max_mismatches=0, min_identity=0.90\n", ")\n", "\n", - "map.mapping_substitutions(mapped_scaffolds, protein_norm, title= \"scaffolds mapped in RF-selected peptides\")" + 
"map.mapping_substitutions(\n", + " mapped_scaffolds, protein_norm, title=\"scaffolds mapped in RF-selected peptides\"\n", + ")" ] }, { @@ -754,14 +745,14 @@ "source": [ "clus.cluster_fasta_files(input_folder=str(scaffolds_folder_out))\n", "\n", - "fasta_input = scaffolds_folder_out / f\"scaffolds.fasta\"\n", + "fasta_input = scaffolds_folder_out / \"scaffolds.fasta\"\n", "\n", "cluster_tsv_folder = clustering_out / run_id\n", - " \n", + "\n", "clus.process_fasta_and_clusters(\n", - " fasta_file=str(fasta_input),\n", - " input_folder=str(scaffolds_folder_out),\n", - " )" + " fasta_file=str(fasta_input),\n", + " input_folder=str(scaffolds_folder_out),\n", + ")" ] }, { @@ -798,10 +789,10 @@ "outputs": [], "source": [ "cons.process_alignment_files(\n", - " align_folder=str(alignment_out),\n", - " output_folder=str(consensus_out),\n", - " run_id=run_id,\n", - " )" + " align_folder=str(alignment_out),\n", + " output_folder=str(consensus_out),\n", + " run_id=run_id,\n", + ")" ] } ], diff --git a/docs/source/tutorials/examples/hybrid_workflow_with_selector.ipynb b/docs/source/tutorials/examples/hybrid_workflow_with_selector.ipynb index ddf204f..8392977 100644 --- a/docs/source/tutorials/examples/hybrid_workflow_with_selector.ipynb +++ b/docs/source/tutorials/examples/hybrid_workflow_with_selector.ipynb @@ -44,7 +44,6 @@ "import alignment as align\n", "import clustering as clus\n", "import preprocessing as prep\n", - "import compute_statistics as comp_stat\n", "import model_peptide_selector as selector\n", "\n", "# import libraries\n", @@ -135,16 +134,10 @@ "metadata": {}, "outputs": [], "source": [ - "def get_combination_name(\n", - " ass_method,\n", - " conf,\n", - " kmer_size,\n", - " size_threshold,\n", - " min_overlap\n", - "):\n", + "def get_combination_name(ass_method, conf, kmer_size, size_threshold, min_overlap):\n", " if ass_method in (\"dbg\", \"hybrid\"):\n", " return f\"comb_{ass_method}_c{conf}_ks{kmer_size}_ts{size_threshold}_mo{min_overlap}\"\n", - " \n", + "\n", " elif ass_method == \"greedy\":\n", " return f\"comb_{ass_method}_c{conf}_ts{size_threshold}_mo{min_overlap}\"" ] @@ -207,12 +200,7 @@ } ], "source": [ - "comb = get_combination_name(\n", - " ass_method,\n", - " conf,\n", - " kmer_size,\n", - " size_threshold,\n", - " min_overlap)\n", + "comb = get_combination_name(ass_method, conf, kmer_size, size_threshold, min_overlap)\n", "\n", "print(comb)" ] @@ -228,7 +216,7 @@ " \"ass_method\": ass_method,\n", " \"conf\": conf,\n", " \"size_threshold\": size_threshold,\n", - " \"min_overlap\": min_overlap\n", + " \"min_overlap\": min_overlap,\n", "}" ] }, @@ -410,7 +398,7 @@ } ], "source": [ - "#protein_norm = prep.normalize_sequence(protein)\n", + "# protein_norm = prep.normalize_sequence(protein)\n", "\n", "df = pd.read_csv(INPUT_DIR / f\"{run}.csv\")\n", "\n", @@ -471,7 +459,9 @@ } ], "source": [ - "df[df.duplicated(subset=[\"cleaned_preds\"], keep=False)][[\"cleaned_preds\"]].value_counts()" + "df[df.duplicated(subset=[\"cleaned_preds\"], keep=False)][\n", + " [\"cleaned_preds\"]\n", + "].value_counts()" ] }, { @@ -500,7 +490,12 @@ } ], "source": [ - "print(df.loc[df[\"cleaned_preds\"].str.len().sort_values(ascending=False).index, \"cleaned_preds\"])" + "print(\n", + " df.loc[\n", + " df[\"cleaned_preds\"].str.len().sort_values(ascending=False).index,\n", + " \"cleaned_preds\",\n", + " ]\n", + ")" ] }, { @@ -543,7 +538,7 @@ "outputs": [], "source": [ "filtered_psms = prep.filter_contaminants(\n", - "cleaned_psms, run, FASTA_DIR / \"contaminants.fasta\"\n", + " cleaned_psms, 
run, FASTA_DIR / \"contaminants.fasta\"\n", ")" ] }, @@ -600,7 +595,7 @@ "aa_props = selector.load_aa_properties(JSON_DIR / \"aa_properties.json\")\n", "protease_rules = selector.load_protease_rules(JSON_DIR / \"protease_rules.json\")\n", "\n", - "df =selector.build_reference_free_features(df, aa_props, protease_rules)" + "df = selector.build_reference_free_features(df, aa_props, protease_rules)" ] }, { @@ -667,7 +662,9 @@ "metadata": {}, "outputs": [], "source": [ - "print(f\"Total PSMs: {len(df)}, Accepted: {len(accepted)}, Rejected: {len(df) - len(accepted)}\")" + "print(\n", + " f\"Total PSMs: {len(df)}, Accepted: {len(accepted)}, Rejected: {len(df) - len(accepted)}\"\n", + ")" ] }, { @@ -703,12 +700,14 @@ " # Force legend to appear with proper labels\n", " handles, labels = ax.get_legend_handles_labels()\n", " if handles and labels:\n", - " ax.legend(handles=handles, labels=labels, title=None, frameon=False, loc=\"upper right\")\n", + " ax.legend(\n", + " handles=handles, labels=labels, title=None, frameon=False, loc=\"upper right\"\n", + " )\n", "\n", " plt.tight_layout()\n", " Path(output_dir).mkdir(parents=True, exist_ok=True)\n", - " #plt.savefig(Path(output_dir) / \"peptide_length_distribution.svg\", format=\"svg\")\n", - " plt.show()\n" + " # plt.savefig(Path(output_dir) / \"peptide_length_distribution.svg\", format=\"svg\")\n", + " plt.show()" ] }, { @@ -759,7 +758,7 @@ "metadata": {}, "outputs": [], "source": [ - "unique_peptides = list(set(final_psms)) # remove duplicates\n", + "unique_peptides = list(set(final_psms)) # remove duplicates\n", "print(len(unique_peptides))" ] }, @@ -775,7 +774,7 @@ "\n", "for pep in sorted_peps:\n", " if not any(pep in other for other in non_redundant):\n", - " non_redundant.append(pep)\n" + " non_redundant.append(pep)" ] }, { @@ -852,7 +851,7 @@ " cluster_sorted = sorted(cluster, key=lambda x: (freq[x], len(x)), reverse=True)\n", " representatives.append(cluster_sorted[0])\n", "\n", - " return representatives, clusters\n" + " return representatives, clusters" ] }, { @@ -862,11 +861,13 @@ "metadata": {}, "outputs": [], "source": [ - "representatives, clusters = cluster_peptides_by_identity_same_length(filtered, threshold=0.9)\n", + "representatives, clusters = cluster_peptides_by_identity_same_length(\n", + " filtered, threshold=0.9\n", + ")\n", "\n", "print(f\"Original peptides: {len(filtered)}\")\n", "print(f\"Clusters found: {len(clusters)}\")\n", - "print(f\"Non-redundant peptides: {len(representatives)}\")\n" + "print(f\"Non-redundant peptides: {len(representatives)}\")" ] }, { @@ -908,7 +909,9 @@ "metadata": {}, "outputs": [], "source": [ - "map.mapping_substitutions(mapped_psms, protein_norm, title= \"psms mapped in RF-selected peptides\")" + "map.mapping_substitutions(\n", + " mapped_psms, protein_norm, title=\"psms mapped in RF-selected peptides\"\n", + ")" ] }, { @@ -918,7 +921,9 @@ "metadata": {}, "outputs": [], "source": [ - "def plot_confidence_distribution(df, accepted_mask, output_dir, filename=\"psm_selected_rf_distribution.svg\"):\n", + "def plot_confidence_distribution(\n", + " df, accepted_mask, output_dir, filename=\"psm_selected_rf_distribution.svg\"\n", + "):\n", " \"\"\"\n", " Plot and save the distribution of peptide confidence scores using Seaborn.\"\"\"\n", "\n", @@ -1011,7 +1016,7 @@ " plt.close()\n", "\n", " print(f\"Peptide length distribution saved as: {out_path}\")\n", - " return out_path\n" + " return out_path" ] }, { @@ -1162,7 +1167,9 @@ "metadata": {}, "outputs": [], "source": [ - 
"map.mapping_substitutions(mapped_contigs, protein_norm, title= \"psms mapped in RF-selected peptides\")" + "map.mapping_substitutions(\n", + " mapped_contigs, protein_norm, title=\"psms mapped in RF-selected peptides\"\n", + ")" ] }, { @@ -1374,7 +1381,9 @@ " assembled_scaffolds, protein_norm, max_mismatches, min_identity\n", ")\n", "\n", - "map.mapping_substitutions(mapped_scaffolds, protein_norm, title= \"scaffolds mapped in RF-selected peptides\")" + "map.mapping_substitutions(\n", + " mapped_scaffolds, protein_norm, title=\"scaffolds mapped in RF-selected peptides\"\n", + ")" ] }, { diff --git a/pyproject.toml b/pyproject.toml index 94773b5..2be9520 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,6 +9,11 @@ packages = ["src/instanexus"] name = "instanexus" version = "0.2.0" description = "End-to-end workflow for de novo protein sequencing based on InstaNovo" + +authors = [ + { name = "Marco Reverenna", email = "marcor@dtu.dk" } +] + readme = "README.md" readme-content-type = "text/markdown" requires-python = ">=3.10" @@ -56,7 +61,3 @@ target-version = ['py311'] [tool.isort] profile = "black" - -authors = [ - { name = "Marco Reverenna", email = "marcor@dtu.dk" } -] diff --git a/scripts/gridsearch.py b/scripts/gridsearch.py index a5a418a..da67254 100755 --- a/scripts/gridsearch.py +++ b/scripts/gridsearch.py @@ -26,10 +26,9 @@ from concurrent.futures import ProcessPoolExecutor, as_completed from pathlib import Path -from tqdm import tqdm - from src.opt.opt_dbg import run_pipeline_dbg from src.opt.opt_greedy import run_pipeline_greedy +from tqdm import tqdm BASE_DIR = Path(__file__).resolve().parents[2] diff --git a/scripts/model_peptide_selector.py b/scripts/model_peptide_selector.py index 57e60bf..7f9adf0 100644 --- a/scripts/model_peptide_selector.py +++ b/scripts/model_peptide_selector.py @@ -18,23 +18,24 @@ """ -import re import json -import pandas as pd -import numpy as np -import preprocessing as prep +import re from math import log2 from pathlib import Path + +import joblib +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import preprocessing as prep +import seaborn as sns from sklearn.ensemble import RandomForestClassifier -from sklearn.model_selection import train_test_split from sklearn.metrics import ( - precision_recall_curve, - f1_score, average_precision_score, + f1_score, + precision_recall_curve, ) -import matplotlib.pyplot as plt -import joblib -import seaborn as sns +from sklearn.model_selection import train_test_split # side meaning where the protease cleaves # residues meaning which amino acids it cleaves after (C-side) or before (N-side diff --git a/src/instanexus/__main__.py b/src/instanexus/__main__.py index 005b782..6d83bb4 100644 --- a/src/instanexus/__main__.py +++ b/src/instanexus/__main__.py @@ -1,7 +1,9 @@ -import sys import argparse +import sys + from . 
import script_dbg, script_greedy
+

 def main():
     banner = r"""
  ______ __ __ __
@@ -15,8 +17,9 @@ def main():
     parser = argparse.ArgumentParser(
         prog="instanexus",
-        description=(banner + "\n"
-        "InstaNexus CLI: de novo protein sequencing based on InstaNovo,\n\n" \
+        description=(
+            banner + "\n"
+            "InstaNexus CLI: de novo protein sequencing based on InstaNovo,\n\n"
             "an end-to-end workflow from de novo peptides to proteins\n\n"
             "Usage:\n"
             "  instanexus [options]\n\n"
@@ -30,8 +33,8 @@ def main():
         ),
         formatter_class=argparse.RawTextHelpFormatter,
     )
-
-    parser.add_argument('--version', action='version', version='InstaNexus 0.1.0'),
+
+    parser.add_argument("--version", action="version", version="InstaNexus 0.2.0")

     subparsers = parser.add_subparsers(dest="command", help="subcommands")
@@ -50,5 +53,6 @@ def main():
     else:
         parser.print_help()

+
 if __name__ == "__main__":
     main()
diff --git a/src/instanexus/alignment.py b/src/instanexus/alignment.py
index 43438c1..fcc46b2 100755
--- a/src/instanexus/alignment.py
+++ b/src/instanexus/alignment.py
@@ -3,14 +3,14 @@
 r"""Alignment module for clustered scaffolds.

  ██████████ ███████████ █████ █████
-░░███░░░░███ ░█░░░███░░░█░░███ ░░███ 
- ░███ ░░███░ ░███ ░ ░███ ░███ 
- ░███ ░███ ░███ ░███ ░███ 
- ░███ ░███ ░███ ░███ ░███ 
- ░███ ███ ░███ ░███ ░███ 
- ██████████ █████ ░░████████ 
-░░░░░░░░░░ ░░░░░ ░░░░░░░░ 
- 
+░░███░░░░███ ░█░░░███░░░█░░███ ░░███
+ ░███ ░░███░ ░███ ░ ░███ ░███
+ ░███ ░███ ░███ ░███ ░███
+ ░███ ░███ ░███ ░███ ░███
+ ░███ ███ ░███ ░███ ░███
+ ██████████ █████ ░░████████
+░░░░░░░░░░ ░░░░░ ░░░░░░░░
+
 __authors__ = Marco Reverenna
 __copyright__ = Copyright 2025-2026
 __research-group__ = DTU Biosustain (Multi-omics Network Analytics) and DTU Bioengineering
@@ -26,9 +26,12 @@
 import shutil
 import subprocess
 from pathlib import Path
+
 from Bio import SeqIO

-logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s"
+)
 logger = logging.getLogger(__name__)


@@ -49,16 +52,29 @@ def align_or_copy_fasta(fasta_file, output_file):
         logger.debug(f"Copied single-sequence file: {Path(fasta_file).name}")
     elif len(sequences) > 1:
         # Multiple sequences, run clustalo
-        logger.debug(f"Aligning {len(sequences)} sequences from {Path(fasta_file).name}...")
+        logger.debug(
+            f"Aligning {len(sequences)} sequences from {Path(fasta_file).name}..."
+        )
         try:
             subprocess.run(
-                ["clustalo", "-i", fasta_file, "-o", output_file, "--outfmt", "fa", "--force"],
+                [
+                    "clustalo",
+                    "-i",
+                    fasta_file,
+                    "-o",
+                    output_file,
+                    "--outfmt",
+                    "fa",
+                    "--force",
+                ],
                 check=True,
-                capture_output=True, # Suppress clustalo stdout
-                text=True
+                capture_output=True,  # Suppress clustalo stdout
+                text=True,
             )
         except FileNotFoundError:
-            logger.error("clustalo command not found. Please ensure it is in your system's PATH.")
+            logger.error(
+                "clustalo command not found. Please ensure it is in your system's PATH."
+            )
             raise
         except subprocess.CalledProcessError as e:
             logger.error(f"Clustalo failed for {fasta_file}: {e.stderr}")
@@ -71,23 +87,24 @@
 def process_alignment(input_dir: str, output_dir: str):
     """
     Align all FASTA files from input_dir and save results in output_dir.
-    
+
     Args:
         input_dir (str): Path to the .../cluster_fasta/ directory.
         output_dir (str): Path to the .../alignment/ directory.
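+
+        Example (paths follow the CLI usage note at the end of this module):
+            process_alignment(
+                "outputs/bsa/scaffolds/clustering/cluster_fasta",
+                "outputs/bsa/scaffolds/alignment",
+            )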
""" cluster_fasta_folder = Path(input_dir) alignment_folder = Path(output_dir) - + alignment_folder.mkdir(parents=True, exist_ok=True) if not cluster_fasta_folder.exists(): logger.error(f"Cluster FASTA folder not found: {cluster_fasta_folder}") - raise FileNotFoundError(f"Cluster FASTA folder not found: {cluster_fasta_folder}") + raise FileNotFoundError( + f"Cluster FASTA folder not found: {cluster_fasta_folder}" + ) fasta_files_to_align = [ - f for f in sorted(os.listdir(cluster_fasta_folder)) - if f.endswith(".fasta") + f for f in sorted(os.listdir(cluster_fasta_folder)) if f.endswith(".fasta") ] logger.info(f"Found {len(fasta_files_to_align)} cluster FASTA files to align.") @@ -95,15 +112,14 @@ def process_alignment(input_dir: str, output_dir: str): for fasta_file in fasta_files_to_align: fasta_path = cluster_fasta_folder / fasta_file # Save output as .afa (alignment FASTA) - output_path = alignment_folder / fasta_file.replace(".fasta", "_aligned.afa") - + output_path = alignment_folder / fasta_file.replace(".fasta", "_aligned.afa") + align_or_copy_fasta(fasta_path, output_path) logger.info("All alignment tasks completed.") -def main(input_cluster_fasta_folder: str, - output_alignment_folder: str): +def main(input_cluster_fasta_folder: str, output_alignment_folder: str): """ Main function to run the alignment script. """ @@ -113,10 +129,9 @@ def main(input_cluster_fasta_folder: str, logger.info(f"Output Folder (Alignments): {output_alignment_folder}") process_alignment( - input_dir=input_cluster_fasta_folder, - output_dir=output_alignment_folder + input_dir=input_cluster_fasta_folder, output_dir=output_alignment_folder ) - + logger.info("--- Step 4: Alignment Completed ---") @@ -127,25 +142,26 @@ def cli(): parser = argparse.ArgumentParser( description="Alignment script for clustered scaffolds." ) - + parser.add_argument( "--input-folder", type=str, required=True, - help="Path to the folder containing cluster FASTA files (e.g., .../cluster_fasta)." + help="Path to the folder containing cluster FASTA files (e.g., .../cluster_fasta).", ) parser.add_argument( "--output-folder", type=str, required=True, - help="Path to the folder to save aligned .afa files (e.g., .../alignment)." + help="Path to the folder to save aligned .afa files (e.g., .../alignment).", ) - + args = parser.parse_args() - main(input_cluster_fasta_folder=args.input_folder, - output_alignment_folder=args.output_folder - ) + main( + input_cluster_fasta_folder=args.input_folder, + output_alignment_folder=args.output_folder, + ) if __name__ == "__main__": @@ -153,4 +169,4 @@ def cli(): # python -m instanexus.alignment \ # --input-folder outputs/bsa/scaffolds/clustering/cluster_fasta \ -# --output-folder outputs/bsa/scaffolds/alignment \ No newline at end of file +# --output-folder outputs/bsa/scaffolds/alignment diff --git a/src/instanexus/assembly.py b/src/instanexus/assembly.py index 6812367..e24bf62 100644 --- a/src/instanexus/assembly.py +++ b/src/instanexus/assembly.py @@ -3,14 +3,14 @@ r"""Assembly module for InstaNexus. 
██████████ ███████████ █████ █████ -░░███░░░░███ ░█░░░███░░░█░░███ ░░███ - ░███ ░░███░ ░███ ░ ░███ ░███ - ░███ ░███ ░███ ░███ ░███ - ░███ ░███ ░███ ░███ ░███ - ░███ ███ ░███ ░███ ░███ - ██████████ █████ ░░████████ -░░░░░░░░░░ ░░░░░ ░░░░░░░░ - +░░███░░░░███ ░█░░░███░░░█░░███ ░░███ + ░███ ░░███░ ░███ ░ ░███ ░███ + ░███ ░███ ░███ ░███ ░███ + ░███ ░███ ░███ ░███ ░███ + ░███ ███ ░███ ░███ ░███ + ██████████ █████ ░░████████ +░░░░░░░░░░ ░░░░░ ░░░░░░░░ + __authors__ = Marco Reverenna & Konstantinos Kalogeropoulus __copyright__ = Copyright 2024-2025 __research-group__ = DTU Biosustain (Multi-omics Network Analytics) and DTU Bioengineering @@ -20,34 +20,31 @@ __status__ = Dev """ +import argparse + # import libraries import logging +from collections import Counter, defaultdict +from dataclasses import dataclass +from itertools import combinations +from pathlib import Path +from typing import Dict, Iterable, List + +import Bio import networkx as nx import pandas as pd -import argparse -import Bio +from tqdm import tqdm from . import helpers from . import visualization as viz -from tqdm import tqdm -from pathlib import Path -from Bio import SeqIO -from collections import defaultdict -from collections import Counter -from itertools import combinations -from dataclasses import dataclass -from typing import List, Tuple, Dict, Any, Iterable, Optional - - logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) - def find_peptide_overlaps(peptides, min_overlap): """Finds overlaps between peptide sequences using a greedy approach.""" - overlaps = defaultdict(list) + overlaps = defaultdict(list) for index_a, peptide_a in tqdm(enumerate(peptides), desc="Finding overlaps"): for index_b, peptide_b in enumerate(peptides): @@ -136,14 +133,16 @@ def combine_seqs_into_scaffolds(contigs, min_overlap): def scaffold_iterative_greedy(contigs, min_overlap, size_threshold, disable_tqdm=False): """Iterative scaffolding using Greedy approach.""" + def clean(seqs): """Remove duplicates, filter by length, and sort by descending size.""" seqs = list(set(seqs)) seqs = [s for s in seqs if len(s) > size_threshold] return sorted(seqs, key=len, reverse=True) + current = clean(contigs) prev = None - + while prev != current: prev = current next_round = combine_seqs_into_scaffolds(current, min_overlap) @@ -152,7 +151,7 @@ def clean(seqs): if next_round == current: break current = next_round - + return current @@ -160,7 +159,7 @@ def get_kmers(seqs, kmer_size): """Generate k-mers of specified length from input sequences.""" kmers = [] for seq in seqs: - kmers.extend(seq[i:i+kmer_size] for i in range(len(seq)-kmer_size+1)) + kmers.extend(seq[i : i + kmer_size] for i in range(len(seq) - kmer_size + 1)) return kmers @@ -178,8 +177,7 @@ def get_kmer_counts(kmers): def get_debruijn_edges_from_kmers(kmers): - """Generate edges of a De Bruijn graph from a list of k-mers. 
- """ + """Generate edges of a De Bruijn graph from a list of k-mers.""" edges = set() k_1mers = defaultdict(set) for kmer in kmers: @@ -226,9 +224,7 @@ def traverse_iterative(start_node): def find_overlaps(contigs, min_overlap, disable_tqdm=False): """Find overlaps between pairs of contigs based on specified minimum overlap.""" overlaps = [] - total_pairs = sum( - 1 for _ in combinations(contigs, 2) - ) + total_pairs = sum(1 for _ in combinations(contigs, 2)) with tqdm(total=total_pairs, desc="Finding overlaps", disable=disable_tqdm) as pbar: for a, b in combinations( contigs, 2 @@ -284,7 +280,6 @@ def scaffold_iterative_dbg(contigs, min_overlap, size_threshold, disable_tqdm=Fa return sorted(current, key=len, reverse=True) - def get_kmers(sequences: Iterable[str], kmer_size: int) -> List[str]: """Generate all k-mers from a list of sequences.""" kmers = [] @@ -294,7 +289,7 @@ def get_kmers(sequences: Iterable[str], kmer_size: int) -> List[str]: L = len(seq) if L < kmer_size: continue - kmers.extend(seq[i:i+kmer_size] for i in range(L - kmer_size + 1)) + kmers.extend(seq[i : i + kmer_size] for i in range(L - kmer_size + 1)) return kmers @@ -325,7 +320,9 @@ def build_dbg_from_kmers(kmers: Iterable[str]) -> nx.DiGraph: def filter_low_weight_edges(G: nx.DiGraph, min_weight: int = 2) -> nx.DiGraph: """Remove edges with weight < min_weight (light error correction).""" - to_remove = [(u, v) for u, v, d in G.edges(data=True) if d.get("weight", 0) < min_weight] + to_remove = [ + (u, v) for u, v, d in G.edges(data=True) if d.get("weight", 0) < min_weight + ] G.remove_edges_from(to_remove) # drop isolated nodes iso = [n for n in G.nodes if G.in_degree(n) == 0 and G.out_degree(n) == 0] @@ -335,9 +332,10 @@ def filter_low_weight_edges(G: nx.DiGraph, min_weight: int = 2) -> nx.DiGraph: @dataclass class ContigPath: - nodes: List[str] # list of (k-1)-mer node labels in path order - seq: str # assembled sequence - weights: List[int] # edge weights along the path + nodes: List[str] # list of (k-1)-mer node labels in path order + seq: str # assembled sequence + weights: List[int] # edge weights along the path + def _extend_linear_path(G: nx.DiGraph, start: str, succ: str) -> ContigPath: """Extend from start→succ while in/out-degree == 1 (unbranched).""" @@ -405,13 +403,20 @@ class ContigScore: max_weight: int score: float -def score_contig(cp: ContigPath, alpha_len: float = 1.0, alpha_cov: float = 1.0, alpha_min: float = 0.2) -> ContigScore: + +def score_contig( + cp: ContigPath, + alpha_len: float = 1.0, + alpha_cov: float = 1.0, + alpha_min: float = 0.2, +) -> ContigScore: """ Simple reference-free score combining length and coverage: score = alpha_len * log(length) + alpha_cov * mean_weight + alpha_min * min_weight Adjust alphas to your data. You can also plug-in intensity-based terms later. 
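+
+    Worked example (illustrative numbers, not from real data): with the
+    default alphas and natural log, a contig of length 100 whose edge
+    weights have mean 3 and minimum 1 scores
+    1.0 * log(100) + 1.0 * 3 + 0.2 * 1 ≈ 7.81.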
""" import math + if cp.weights: mean_w = sum(cp.weights) / len(cp.weights) min_w = min(cp.weights) @@ -420,26 +425,37 @@ def score_contig(cp: ContigPath, alpha_len: float = 1.0, alpha_cov: float = 1.0, mean_w = min_w = max_w = 0.0 L = len(cp.seq) composite = alpha_len * math.log(max(L, 2)) + alpha_cov * mean_w + alpha_min * min_w - return ContigScore(seq=cp.seq, length=L, mean_weight=mean_w, min_weight=min_w, max_weight=max_w, score=composite) + return ContigScore( + seq=cp.seq, + length=L, + mean_weight=mean_w, + min_weight=min_w, + max_weight=max_w, + score=composite, + ) -def rank_contigs_by_score(contigs: List[ContigPath], - alpha_len: float = 1.0, - alpha_cov: float = 1.0, - alpha_min: float = 0.2) -> List[ContigScore]: +def rank_contigs_by_score( + contigs: List[ContigPath], + alpha_len: float = 1.0, + alpha_cov: float = 1.0, + alpha_min: float = 0.2, +) -> List[ContigScore]: scored = [score_contig(c, alpha_len, alpha_cov, alpha_min) for c in contigs] return sorted(scored, key=lambda s: (s.score, s.length), reverse=True) -def scaffold_iterative_dbgx(seqs: List[str], - kmer_size: int, - size_threshold: int = 10, - min_weight: int = 2, - max_rounds: int = 5, - patience: int = 2, - alpha_len: float = 1.0, - alpha_cov: float = 1.0, - alpha_min: float = 0.2) -> List[str]: +def scaffold_iterative_dbgx( + seqs: List[str], + kmer_size: int, + size_threshold: int = 10, + min_weight: int = 2, + max_rounds: int = 5, + patience: int = 2, + alpha_len: float = 1.0, + alpha_cov: float = 1.0, + alpha_min: float = 0.2, +) -> List[str]: """ Optional refinement: rebuild DBG from current contigs → collapse → filter by size → repeat @@ -463,7 +479,9 @@ def scaffold_iterative_dbgx(seqs: List[str], seqs_new = [r.seq for r in ranked] # improvement heuristic: fewer contigs or longer top contig - improved = (len(seqs_new) < len(best)) or (seqs_new and best and len(seqs_new[0]) > len(best[0])) + improved = (len(seqs_new) < len(best)) or ( + seqs_new and best and len(seqs_new[0]) > len(best[0]) + ) if improved: best = seqs_new no_improve = 0 @@ -494,7 +512,7 @@ def extend_path_dbg(G, contig, k, min_weight=1): # Extension forwards while extended: extended = False - suffix = seq[-(k-1):] + suffix = seq[-(k - 1) :] if suffix not in G or G.out_degree(suffix) == 0: break @@ -522,7 +540,7 @@ def extend_path_dbg(G, contig, k, min_weight=1): extended = True while extended: extended = False - prefix = seq[:k-1] + prefix = seq[: k - 1] if prefix not in G or G.in_degree(prefix) == 0: break @@ -551,22 +569,26 @@ def extend_path_dbg(G, contig, k, min_weight=1): class Assembler: """Unified assembler supporting 'greedy', 'dbg' and 'dbgx' (NetworkX) modes.""" - def __init__(self, - mode: str = "greedy", - min_overlap: int = 4, - size_threshold: int = 10, - kmer_size: int = 6, - min_identity: float = 0.8, - max_mismatches: int = 10, - # dbgx-specific: - min_weight: int = 2, - refine_rounds: int = 0, # 0 = no refine, >0 enables iterative refine - refine_patience: int = 2, - alpha_len: float = 1.0, - alpha_cov: float = 1.0, - alpha_min: float = 0.2): + def __init__( + self, + mode: str = "greedy", + min_overlap: int = 4, + size_threshold: int = 10, + kmer_size: int = 6, + min_identity: float = 0.8, + max_mismatches: int = 10, + # dbgx-specific: + min_weight: int = 2, + refine_rounds: int = 0, # 0 = no refine, >0 enables iterative refine + refine_patience: int = 2, + alpha_len: float = 1.0, + alpha_cov: float = 1.0, + alpha_min: float = 0.2, + ): if mode not in ["greedy", "dbg", "dbg_weighted", "dbgX", "fusion"]: - raise 
ValueError("mode must be 'greedy', 'dbg', 'dbg_weighted', 'dbgX' or 'fusion'") + raise ValueError( + "mode must be 'greedy', 'dbg', 'dbg_weighted', 'dbgX' or 'fusion'" + ) self.mode = mode self.min_overlap = min_overlap @@ -585,12 +607,16 @@ def __init__(self, def assemble_greedy(self, sequences): - logger.info(f"[Assembler] Running Greedy assembly (min_overlap={self.min_overlap})") + logger.info( + f"[Assembler] Running Greedy assembly (min_overlap={self.min_overlap})" + ) contigs = assemble_contigs_greedy(sequences, self.min_overlap) contigs = list(set(contigs)) contigs = sorted(contigs, key=len, reverse=True) - scaffolds = scaffold_iterative_greedy(contigs, self.min_overlap, self.size_threshold) + scaffolds = scaffold_iterative_greedy( + contigs, self.min_overlap, self.size_threshold + ) return scaffolds @@ -604,13 +630,16 @@ def assemble_dbg(self, sequences): contigs = sorted(contigs, key=len, reverse=True) contigs = [seq for seq in contigs if len(seq) > self.size_threshold] - scaffolds = scaffold_iterative_dbg(contigs, self.min_overlap, self.size_threshold) + scaffolds = scaffold_iterative_dbg( + contigs, self.min_overlap, self.size_threshold + ) return scaffolds - def assemble_dbg_weighted(self, sequences: List[str]) -> List[str]: - logger.info(f"[Assembler] Running DBG weighted (k={self.kmer_size}, min_weight={self.min_weight}, refine_rounds={self.refine_rounds})") + logger.info( + f"[Assembler] Running DBG weighted (k={self.kmer_size}, min_weight={self.min_weight}, refine_rounds={self.refine_rounds})" + ) kmers = get_kmers(sequences, self.kmer_size) if not kmers: @@ -625,7 +654,9 @@ def assemble_dbg_weighted(self, sequences: List[str]) -> List[str]: logger.warning("No contigs assembled from DBGX; returning empty result.") return [] - ranked = rank_contigs_by_score(contigs_cp, self.alpha_len, self.alpha_cov, self.alpha_min) + ranked = rank_contigs_by_score( + contigs_cp, self.alpha_len, self.alpha_cov, self.alpha_min + ) contigs = [r.seq for r in ranked] if self.refine_rounds and self.refine_rounds > 0: @@ -644,7 +675,7 @@ def assemble_dbg_weighted(self, sequences: List[str]) -> List[str]: scaffolds = list(contigs) return scaffolds - + def assemble_dbgX(self, sequences): logger.info(f"[Assembler] Running DBG-Extension (k={self.kmer_size})") @@ -656,13 +687,15 @@ def assemble_dbgX(self, sequences): contigs = [c.seq for c in contigs_cp] logger.info("Extending contigs using DBG paths (coverage-aware)...") - extended_contigs = [extend_path_dbg(G, c, self.kmer_size, self.min_weight) for c in contigs] + extended_contigs = [ + extend_path_dbg(G, c, self.kmer_size, self.min_weight) for c in contigs + ] extended_contigs = sorted(set(extended_contigs), key=len, reverse=True) return extended_contigs - + def assemble_fusion(self, sequences): - logger.info(f"[Assembler] Running FUSION (DBG weighted + greedy merge)") + logger.info("[Assembler] Running FUSION (DBG weighted + greedy merge)") contigs_dbg_weighted = self.assemble_dbg_weighted(sequences) @@ -681,12 +714,11 @@ def assemble_fusion(self, sequences): return fused - def run(self, sequences: List[str]): if not sequences: logger.error("No valid sequences provided for assembly.") raise ValueError("Input sequences list is empty.") - + if self.mode == "greedy": return self.assemble_greedy(sequences) elif self.mode == "dbg": @@ -708,29 +740,33 @@ def main( min_overlap: int, size_threshold: int, reference: bool, - chain: str, + chain: str, min_identity: float, max_mismatches: int, ): """Main function for standalone assembly.""" - protein_norm = 
None # None means no reference mode + protein_norm = None # None means no reference mode if reference: - logger.info(f"Reference mode enabled. Loading reference protein...") + logger.info("Reference mode enabled. Loading reference protein...") if not metadata_json_path: - raise ValueError("metadata_json_path is required when reference mode is enabled.") - + raise ValueError( + "metadata_json_path is required when reference mode is enabled." + ) + try: - run_name = Path(input_csv_path).stem # extract run name from input file - meta = helpers.get_sample_metadata(run=run_name, chain=chain, json_path=metadata_json_path) - protein= meta["protein"] + run_name = Path(input_csv_path).stem # extract run name from input file + meta = helpers.get_sample_metadata( + run=run_name, chain=chain, json_path=metadata_json_path + ) + protein = meta["protein"] protein_norm = helpers.get_normalized_protein(protein) logger.info("Reference protein loaded and normalized successfully.") - + except Exception as e: logger.error(f"Failed to get reference protein: {e}") logger.warning("Disabling reference mode.") - reference = False # if fails, disable reference mode + reference = False # if fails, disable reference mode input_data = Path(input_csv_path) @@ -750,10 +786,10 @@ def main( size_threshold=size_threshold, kmer_size=kmer_size, min_identity=min_identity, - max_mismatches=max_mismatches + max_mismatches=max_mismatches, ) - scaffolds = assembler.run(sequences = sequences) + scaffolds = assembler.run(sequences=sequences) output_path = Path(output_scaffolds_path) output_folder = output_path.parent.mkdir(parents=True, exist_ok=True) @@ -771,7 +807,9 @@ def main( "fasta", ) - logger.info(f"Assembly completed — {len(scaffolds)} scaffolds saved to {output_path}") + logger.info( + f"Assembly completed — {len(scaffolds)} scaffolds saved to {output_path}" + ) if protein_norm: logger.info("Reference mode: calculating statistics...") @@ -782,7 +820,7 @@ def main( assembled_contigs=scaffolds, target_protein=protein_norm, max_mismatches=max_mismatches, - min_identity=min_identity, + min_identity=min_identity, ) df_scaffolds_mapped = viz.create_dataframe_from_mapped_sequences( data=mapped_scaffolds @@ -817,7 +855,7 @@ def cli(): parser.add_argument( "--metadata-json-path", type=str, - default=None, # Optional, but required by --reference + default=None, # Optional, but required by --reference help="Path to sample_metadata.json (required for --reference).", ) parser.add_argument( @@ -855,19 +893,19 @@ def cli(): "--chain", type=str, default="", - help="Specify chain type (light/heavy) required for reference lookup." + help="Specify chain type (light/heavy) required for reference lookup.", ) parser.add_argument( "--min-identity", type=float, default=0.8, - help="Minimum identity for reference mapping." + help="Minimum identity for reference mapping.", ) parser.add_argument( "--max-mismatches", type=int, default=10, - help="Maximum mismatches for reference mapping." 
+ help="Maximum mismatches for reference mapping.", ) args = parser.parse_args() @@ -876,7 +914,7 @@ def cli(): if args.assembly_mode != "dbg": args.kmer_size = 0 logger.info("Ignoring kmer_size (used only for DBG mode).") - + if args.reference and not args.metadata_json_path: parser.error("--metadata-json-path is required when --reference is enabled.") diff --git a/src/instanexus/clustering.py b/src/instanexus/clustering.py index c7d0633..f5bf631 100755 --- a/src/instanexus/clustering.py +++ b/src/instanexus/clustering.py @@ -3,14 +3,14 @@ r"""Clustering module for assembled scaffolds. ██████████ ███████████ █████ █████ -░░███░░░░███ ░█░░░███░░░█░░███ ░░███ - ░███ ░░███░ ░███ ░ ░███ ░███ - ░███ ░███ ░███ ░███ ░███ - ░███ ░███ ░███ ░███ ░███ - ░███ ███ ░███ ░███ ░███ - ██████████ █████ ░░████████ -░░░░░░░D░░ ░░░░░ ░░░░░░░░ - +░░███░░░░███ ░█░░░███░░░█░░███ ░░███ + ░███ ░░███░ ░███ ░ ░███ ░███ + ░███ ░███ ░███ ░███ ░███ + ░███ ░███ ░███ ░███ ░███ + ░███ ███ ░███ ░███ ░███ + ██████████ █████ ░░████████ +░░░░░░░D░░ ░░░░░ ░░░░░░░░ + __authors__ = Marco Reverenna & Konstantinos Kalogeropoulus __copyright__ = Copyright 2025-2026 __research-group__ = DTU Biosustain (Multi-omics Network Analytics) and DTU Bioengineering @@ -24,25 +24,28 @@ import logging import shutil import subprocess -import pandas as pd from pathlib import Path from tempfile import mkdtemp + +import pandas as pd from Bio import SeqIO from tqdm import tqdm -logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s") +logging.basicConfig( + level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s" +) logger = logging.getLogger(__name__) def cluster_fasta_files( - scaffolds_folder: Path, - clustering_dir: Path, - min_seq_id: float = 0.85, - coverage: float = 0.8 + scaffolds_folder: Path, + clustering_dir: Path, + min_seq_id: float = 0.85, + coverage: float = 0.8, ): """ Runs mmseqs easy-cluster on the scaffolds.fasta file. - + This is based on the user's provided function, but modified to be more robust by targeting only 'scaffolds.fasta' instead of looping. """ @@ -51,7 +54,7 @@ def cluster_fasta_files( cluster_fasta_dir.mkdir(exist_ok=True) temp_dir = mkdtemp(prefix="mmseqs-") - + fasta_path = scaffolds_folder / "scaffolds.fasta" if not fasta_path.is_file(): logger.error(f"Input file not found, skipping clustering: {fasta_path}") @@ -124,22 +127,26 @@ def process_fasta_and_clusters(fasta_file: Path, scaffolds_folder: Path): return try: - cluster_df = pd.read_csv(cluster_tsv, sep="\t", header=None, names=["cluster", "contig"]) + cluster_df = pd.read_csv( + cluster_tsv, sep="\t", header=None, names=["cluster", "contig"] + ) except pd.errors.EmptyDataError: - logger.warning(f"Cluster TSV file is empty: {cluster_tsv}. No clusters to process.") + logger.warning( + f"Cluster TSV file is empty: {cluster_tsv}. No clusters to process." 
+ ) return - + records = list(SeqIO.parse(fasta_file, "fasta")) - records_dict = SeqIO.to_dict(records) # Faster lookup + records_dict = SeqIO.to_dict(records) # Faster lookup clusters = cluster_df["cluster"].unique() logger.info(f"Splitting {len(records)} sequences into {len(clusters)} clusters.") - - width = max(4, len(str(len(clusters)))) # For zfill padding + + width = max(4, len(str(len(clusters)))) # For zfill padding for i, cluster_id in enumerate(tqdm(clusters, desc="Processing clusters")): contig_ids = cluster_df[cluster_df["cluster"] == cluster_id]["contig"].values - + contig_records = [] for contig_id in contig_ids: if contig_id in records_dict: @@ -153,16 +160,12 @@ def process_fasta_and_clusters(fasta_file: Path, scaffolds_folder: Path): logger.info(f"All cluster FASTA files created in: {cluster_fasta_dir}") -def main( - input_scaffolds_folder: str, - min_seq_id: float, - coverage: float -): +def main(input_scaffolds_folder: str, min_seq_id: float, coverage: float): """ Main function to run the clustering script. """ logger.info("--- Starting Step 3: Clustering ---") - + scaffolds_folder_path = Path(input_scaffolds_folder) clustering_out_path = scaffolds_folder_path / "clustering" fasta_input_path = scaffolds_folder_path / "scaffolds.fasta" @@ -172,25 +175,27 @@ def main( if not scaffolds_folder_path.exists(): logger.error(f"Scaffolds folder does not exist: {scaffolds_folder_path}") - raise FileNotFoundError(f"Scaffolds folder does not exist: {scaffolds_folder_path}") - + raise FileNotFoundError( + f"Scaffolds folder does not exist: {scaffolds_folder_path}" + ) + logger.info("Running cluster_fasta_files...") cluster_fasta_files( scaffolds_folder=scaffolds_folder_path, clustering_dir=clustering_out_path, min_seq_id=min_seq_id, - coverage=coverage + coverage=coverage, ) - + # --- Step 2: Process the cluster results (split FASTA) --- logger.info("Running process_fasta_and_clusters...") process_fasta_and_clusters( - fasta_file=fasta_input_path, - scaffolds_folder=scaffolds_folder_path + fasta_file=fasta_input_path, scaffolds_folder=scaffolds_folder_path ) - + logger.info("--- Step 3: Clustering Completed ---") + def cli(): """ Command-line interface (CLI) for the clustering script. @@ -198,35 +203,36 @@ def cli(): parser = argparse.ArgumentParser( description="Clustering script for assembled scaffolds." ) - + parser.add_argument( "--input-scaffolds-folder", type=str, required=True, - help="Path to the folder containing 'scaffolds.fasta' (e.g., 'outputs/bsa/scaffolds')." + help="Path to the folder containing 'scaffolds.fasta' (e.g., 'outputs/bsa/scaffolds').", ) parser.add_argument( "--min-seq-id", type=float, default=0.85, - help="Minimum sequence identity for mmseqs (default: 0.85)." + help="Minimum sequence identity for mmseqs (default: 0.85).", ) parser.add_argument( "--coverage", type=float, default=0.8, - help="Coverage parameter (-c) for mmseqs (default: 0.8)." 
+ help="Coverage parameter (-c) for mmseqs (default: 0.8).", ) - + args = parser.parse_args() - + main( input_scaffolds_folder=args.input_scaffolds_folder, min_seq_id=args.min_seq_id, - coverage=args.coverage + coverage=args.coverage, ) + if __name__ == "__main__": cli() -# python -m instanexus/clustering --input-scaffolds-folder outputs/bsa/scaffolds --min-seq-id 0.85 --coverage 0.8 \ No newline at end of file +# python -m instanexus/clustering --input-scaffolds-folder outputs/bsa/scaffolds --min-seq-id 0.85 --coverage 0.8 diff --git a/src/instanexus/consensus.py b/src/instanexus/consensus.py index d8cfa14..5b81e32 100755 --- a/src/instanexus/consensus.py +++ b/src/instanexus/consensus.py @@ -3,14 +3,14 @@ r"""Consensus generation module for aligned scaffolds. ██████████ ███████████ █████ █████ -░░███░░░░███ ░█░░░███░░░█░░███ ░░███ - ░███ ░░███░ ░███ ░ ░███ ░███ - ░███ ░███ ░███ ░███ ░███ - ░███ ░███ ░███ ░███ ░███ - ░███ ███ ░███ ░███ ░███ - ██████████ █████ ░░████████ -░░░░░░░░░░ ░░░░░ ░░░░░░░░ - +░░███░░░░███ ░█░░░███░░░█░░███ ░░███ + ░███ ░░███░ ░███ ░ ░███ ░███ + ░███ ░███ ░███ ░███ ░███ + ░███ ░███ ░███ ░███ ░███ + ░███ ███ ░███ ░███ ░███ + ██████████ █████ ░░████████ +░░░░░░░░░░ ░░░░░ ░░░░░░░░ + __authors__ = Marco Reverenna __copyright__ = Copyright 2025-2026 __research-group__ = DTU Biosustain (Multi-omics Network Analytics) and DTU Bioengineering @@ -21,28 +21,28 @@ """ import argparse +import json import logging import os -import json import re import statistics -from tqdm import tqdm - -from pathlib import Path from collections import Counter -from Bio import SeqIO -from Bio.Seq import Seq -from Bio.SeqRecord import SeqRecord +from pathlib import Path -import Bio.SeqRecord -import pandas as pd +import Bio.SeqRecord +import logomaker import matplotlib.pyplot as plt +import pandas as pd import seaborn as sns -import logomaker +from Bio import SeqIO +from tqdm import tqdm -logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s") +logging.basicConfig( + level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s" +) logger = logging.getLogger(__name__) + def generate_pssm(aligned_records): """Generates a Position-Specific Scoring Matrix (PSSM) from aligned sequences.""" pssm = {} @@ -52,17 +52,17 @@ def generate_pssm(aligned_records): pssm[i] = Counter() if aa != "-": # Ignore gaps pssm[i][aa] += 1 - + # Normalize to frequencies for i in pssm: total = sum(pssm[i].values()) if total > 0: for aa in pssm[i]: pssm[i][aa] /= total - + pssm_df = pd.DataFrame(pssm).fillna(0).T pssm_df.index = pssm_df.index + 1 # 1-based indexing for positions - pssm_df = pssm_df.sort_index(axis=1) # Sort columns alphabetically (A, C, D...) + pssm_df = pssm_df.sort_index(axis=1) # Sort columns alphabetically (A, C, D...) 
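+
+    # Toy illustration (hypothetical input, not pipeline data): for two
+    # aligned sequences "AC" and "AG", the resulting frame has
+    # row 1 -> {A: 1.0} and row 2 -> {C: 0.5, G: 0.5}.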
return pssm_df @@ -80,33 +80,33 @@ def generate_consensus(pssm_df, threshold=0.6): def plot_heatmap2(pssm_df, output_file): """Generates and saves a Seaborn heatmap from a PSSM.""" df_t = pssm_df.T - + max_fig_width = 50 - fig_width = min(len(pssm_df) / 1.5, max_fig_width) - fig_height = max(5, len(df_t) / 4) + fig_width = min(len(pssm_df) / 1.5, max_fig_width) + fig_height = max(5, len(df_t) / 4) fig, ax = plt.subplots(figsize=(fig_width, fig_height)) - + sns.heatmap( df_t, ax=ax, cmap="Reds", linewidths=0.1, - linecolor='lightgrey', - cbar_kws={'label': 'Frequency'} + linecolor="lightgrey", + cbar_kws={"label": "Frequency"}, ) - + tick_positions = list(range(0, df_t.shape[1], 5)) - tick_labels = [str(t+1) for t in tick_positions] - ax.set_xticks([t + 0.5 for t in tick_positions]) + tick_labels = [str(t + 1) for t in tick_positions] + ax.set_xticks([t + 0.5 for t in tick_positions]) ax.set_xticklabels(tick_labels, rotation=0) ax.set_xlabel("Position", fontsize=14) ax.set_ylabel("Amino Acid", fontsize=14) - + plt.tight_layout() plt.savefig(output_file, format="svg", dpi=150) - plt.close(fig) + plt.close(fig) def plot_logo2(pssm_df, output_file): @@ -114,7 +114,7 @@ def plot_logo2(pssm_df, output_file): max_fig_width = 50 fig_width = min(len(pssm_df) / 1.5, max_fig_width) fig, ax = plt.subplots(figsize=[fig_width, 3]) - + logo = logomaker.Logo( pssm_df, ax=ax, @@ -131,7 +131,7 @@ def plot_logo2(pssm_df, output_file): width=0.85, baseline_width=0.5, ) - + logo.style_xticks(anchor=0, rotation=0, spacing=1, fontsize=16, ha="center") ax.set_yticks([0, 0.5, 1]) ax.set_yticklabels([0, 0.5, 1], fontsize=16) @@ -165,15 +165,15 @@ def run_consensus_generation(align_folder: str, output_folder: str, run_id: str logo_dir.mkdir(exist_ok=True) alignment_files = [f for f in sorted(os.listdir(align_path)) if f.endswith(".afa")] - + logger.info(f"Found {len(alignment_files)} aligned .afa files.") - + for alignment_file in tqdm( alignment_files, desc=f"[{run_id}] Generating Consensus" if run_id else "Generating Consensus", ): alignment_path = align_path / alignment_file - base_filename = alignment_path.stem # e.g., 'scaffold_0001_aligned' + base_filename = alignment_path.stem # e.g., 'scaffold_0001_aligned' aligned_records = list(SeqIO.parse(alignment_path, "fasta")) if not aligned_records: @@ -206,7 +206,7 @@ def generate_consensus_stats(consensus_base_folder): """Calculates statistics on the generated consensus FASTA files.""" consensus_base_folder = Path(consensus_base_folder) fasta_files = list(consensus_base_folder.glob("*_consensus.fasta")) - + if not fasta_files: logger.warning("No consensus FASTA files found, skipping stats.") return @@ -214,26 +214,28 @@ def generate_consensus_stats(consensus_base_folder): lengths = [] gap_lengths_all = [] sequences_without_gaps = 0 - + for fasta_path in fasta_files: record = next(SeqIO.parse(fasta_path, "fasta")) seq = str(record.seq) seq_len = len(seq) lengths.append(seq_len) - + if "-" not in seq: sequences_without_gaps += 1 else: gap_lengths = [len(g.group()) for g in re.finditer(r"-+", seq)] gap_lengths_all.extend(gap_lengths) - + longest_gap = max(gap_lengths_all) if gap_lengths_all else 0 shortest_gap = min(gap_lengths_all) if gap_lengths_all else 0 - percent_no_gaps = (sequences_without_gaps / len(fasta_files) * 100) if fasta_files else 0 + percent_no_gaps = ( + (sequences_without_gaps / len(fasta_files) * 100) if fasta_files else 0 + ) max_length = max(lengths) if lengths else 0 min_length = min(lengths) if lengths else 0 avg_length = 
statistics.mean(lengths) if lengths else 0 - + stats = { "n_consensus_files": len(fasta_files), "longest_gap": longest_gap, @@ -243,18 +245,14 @@ def generate_consensus_stats(consensus_base_folder): "min_consensus_length": min_length, "avg_consensus_length": round(avg_length, 2), } - + stats_path = consensus_base_folder.parent / "consensus_stats.json" with open(stats_path, "w") as f: json.dump(stats, f, indent=4) logger.info(f"Consensus statistics saved to: {stats_path}") -def main( - input_alignment_folder: str, - output_consensus_folder: str, - run_id: str = "" -): +def main(input_alignment_folder: str, output_consensus_folder: str, run_id: str = ""): """ Main function to run the consensus generation script. """ @@ -271,16 +269,14 @@ def main( consensus_fasta_dir = run_consensus_generation( align_folder=str(align_folder_in), output_folder=str(consensus_folder_out), - run_id=run_id + run_id=run_id, ) - + # --- Step 2: Generate statistics on the consensus files --- if consensus_fasta_dir: logger.info("Running consensus statistics generation...") - generate_consensus_stats( - consensus_base_folder=consensus_fasta_dir - ) - + generate_consensus_stats(consensus_base_folder=consensus_fasta_dir) + logger.info("--- Step 5: Consensus Generation Completed ---") @@ -291,37 +287,38 @@ def cli(): parser = argparse.ArgumentParser( description="Consensus generation script for aligned scaffolds." ) - + parser.add_argument( "--input-folder", type=str, required=True, - help="Path to the folder containing aligned .afa files (e.g., .../alignment)." + help="Path to the folder containing aligned .afa files (e.g., .../alignment).", ) parser.add_argument( "--output-folder", type=str, required=True, - help="Path to the folder to save consensus outputs (e.g., .../consensus)." + help="Path to the folder to save consensus outputs (e.g., .../consensus).", ) parser.add_argument( "--run-id", type=str, default="", - help="Optional ID to display in the progress bar." 
+ help="Optional ID to display in the progress bar.", ) - + args = parser.parse_args() - + main( input_alignment_folder=args.input_folder, output_consensus_folder=args.output_folder, - run_id=args.run_id + run_id=args.run_id, ) + if __name__ == "__main__": cli() # python -m instanexus.consensus \ # --input-folder outputs/bsa/scaffolds/alignment \ -# --output-folder outputs/bsa/scaffolds/consensus \ No newline at end of file +# --output-folder outputs/bsa/scaffolds/consensus diff --git a/src/instanexus/helpers.py b/src/instanexus/helpers.py index 747f443..6bd6a5f 100644 --- a/src/instanexus/helpers.py +++ b/src/instanexus/helpers.py @@ -3,14 +3,14 @@ r""" ██████████ ███████████ █████ █████ -░░███░░░░███ ░█░░░███░░░█░░███ ░░███ - ░███ ░░███░ ░███ ░ ░███ ░███ - ░███ ░███ ░███ ░███ ░███ - ░███ ░███ ░███ ░███ ░███ - ░███ ███ ░███ ░███ ░███ - ██████████ █████ ░░████████ -░░░░░░░░░░ ░░░░░ ░░░░░░░░ - +░░███░░░░███ ░█░░░███░░░█░░███ ░░███ + ░███ ░░███░ ░███ ░ ░███ ░███ + ░███ ░███ ░███ ░███ ░███ + ░███ ░███ ░███ ░███ ░███ + ░███ ███ ░███ ░███ ░███ + ██████████ █████ ░░████████ +░░░░░░░░░░ ░░░░░ ░░░░░░░░ + __authors__ = Marco Reverenna __copyright__ = Copyright 2025-2026 __research-group__ = DTU Biosustain (Multi-omics Network Analytics) and DTU Bioengineering @@ -20,15 +20,11 @@ __status__ = Dev """ -# import libraries -import numpy as np -import pandas as pd -import os import json -import plotly.express as px -import plotly.graph_objects as go +import os from pathlib import Path +# import libraries PROJECT_ROOT = Path(__file__).resolve().parents[2] JSON_DIR = PROJECT_ROOT / "json" @@ -163,4 +159,4 @@ def compute_assembly_statistics(df, sequence_type, output_folder, reference, **p with open(output_path, "w") as file: json.dump(statistics, file, indent=4) - return statistics \ No newline at end of file + return statistics diff --git a/src/instanexus/main.py b/src/instanexus/main.py index 72d7fc5..8720ed4 100644 --- a/src/instanexus/main.py +++ b/src/instanexus/main.py @@ -3,14 +3,14 @@ r"""Pipeline script for InstaNexus. ██████████ ███████████ █████ █████ -░░███░░░░███ ░█░░░███░░░█░░███ ░░███ - ░███ ░░███░ ░███ ░ ░███ ░███ - ░███ ░███ ░███ ░███ ░███ - ░███ ░███ ░███ ░███ ░███ - ░███ ███ ░███ ░███ ░███ - ██████████ █████ ░░████████ -░░░░░░░░░░ ░░░░░ ░░░░░░░░ - +░░███░░░░███ ░█░░░███░░░█░░███ ░░███ + ░███ ░░███░ ░███ ░ ░███ ░███ + ░███ ░███ ░███ ░███ ░███ + ░███ ░███ ░███ ░███ ░███ + ░███ ███ ░███ ░███ ░███ + ██████████ █████ ░░████████ +░░░░░░░░░░ ░░░░░ ░░░░░░░░ + __authors__ = Marco Reverenna __copyright__ = Copyright 2025-2026 __research-group__ = DTU Biosustain (Multi-omics Network Analytics) and DTU Bioengineering @@ -23,14 +23,13 @@ import argparse import logging from pathlib import Path -from . import preprocessing -from . import assembly -from . import clustering -from . import alignment -from . import consensus + +from . 

 # Setup logging
-logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s"
+)
 logging.getLogger("instanexus.preprocessing").setLevel(logging.WARNING)
 logging.getLogger("instanexus.assembly").setLevel(logging.WARNING)
 logging.getLogger("instanexus.clustering").setLevel(logging.WARNING)
@@ -38,13 +37,14 @@
 logging.getLogger("instanexus.consensus").setLevel(logging.WARNING)

 logger = logging.getLogger(__name__)

+
 def cli():
     """Command-line interface for the master pipeline."""
-
+
     parser = argparse.ArgumentParser(
         description="Run the full InstaNexus preprocessing and assembly pipeline."
     )
-
+
     # --- INPUT ARGUMENTS ---
     parser.add_argument(
         "--input-csv",
@@ -62,13 +62,13 @@ def cli():
         "--metadata-json-path",
         type=str,
         required=True,
-        help="Path to the sample_metadata.json file (required by preprocessing and assembly)."
+        help="Path to the sample_metadata.json file (required by preprocessing and assembly).",
     )
     parser.add_argument(
         "--contaminants-fasta-path",
         type=str,
         required=True,
-        help="Path to the contaminants.fasta file (required by preprocessing)."
+        help="Path to the contaminants.fasta file (required by preprocessing).",
     )
     parser.add_argument(
         "--chain",
@@ -84,7 +84,7 @@ def cli():
     parser.add_argument(
         "--conf",
         type=float,
-        default=None, # Default to None for preprocessing logic
+        default=None,  # Default to None for preprocessing logic
         help="Confidence threshold for filtering (optional).",
     )
     parser.add_argument(
@@ -128,17 +128,17 @@ def cli():
         "--min-seq-id",
         type=float,
         default=0.85,
-        help="Minimum sequence identity for mmseqs (default: 0.85)."
+        help="Minimum sequence identity for mmseqs (default: 0.85).",
     )
     parser.add_argument(
         "--coverage",
         type=float,
         default=0.8,
-        help="Coverage parameter (-c) for mmseqs (default: 0.8)."
+        help="Coverage parameter (-c) for mmseqs (default: 0.8).",
     )
-
+
     args = parser.parse_args()
-
+
     # Run the pipeline with the validated arguments
     run_pipeline(args)

@@ -147,21 +147,21 @@ def run_pipeline(args):
     """
     Orchestrates the full pipeline, calling each refactored module.
""" - + logger.info("--- InstaNexus Pipeline started ---") - - run_name = Path(args.input_csv).stem # e.g., 'bsa' - base_output_folder = Path(args.folder_outputs) / run_name # e.g., 'outputs/bsa' - + + run_name = Path(args.input_csv).stem # e.g., 'bsa' + base_output_folder = Path(args.folder_outputs) / run_name # e.g., 'outputs/bsa' + # Build the experiment folder name based on parameters folder_name_parts = [f"{args.assembly_mode}"] - + if args.conf is not None: folder_name_parts.append(f"c{args.conf}") - + if "dbg" in args.assembly_mode: folder_name_parts.append(f"ks{args.kmer_size}") - + folder_name_parts.append(f"mo{args.min_overlap}") folder_name_parts.append(f"ts{args.size_threshold}") @@ -169,29 +169,30 @@ def run_pipeline(args): folder_name_parts.extend([f"mi{args.min_identity}", f"mm{args.max_mismatches}"]) run_folder_name = "_".join(folder_name_parts) - experiment_folder = base_output_folder / run_folder_name # e.g., 'outputs/bsa/greedy_c0.9_mo4_ts10' - + experiment_folder = ( + base_output_folder / run_folder_name + ) # e.g., 'outputs/bsa/greedy_c0.9_mo4_ts10' + cleaned_csv_path = experiment_folder / "cleaned.csv" - + scaffolds_folder = experiment_folder / "scaffolds" scaffolds_fasta_path = scaffolds_folder / "scaffolds.fasta" - - clustering_folder = scaffolds_folder / "clustering" # Clustering output - cluster_fasta_folder = clustering_folder / "cluster_fasta" # Input for alignment - + + clustering_folder = scaffolds_folder / "clustering" # Clustering output + cluster_fasta_folder = clustering_folder / "cluster_fasta" # Input for alignment + alignment_folder = scaffolds_folder / "alignment" - + consensus_folder = scaffolds_folder / "consensus" # ID for logs (optional) run_id_str = f"[{run_name} @ {run_folder_name}]" - + logger.info(f"Starting pipeline for run: {run_id_str}") logger.info(f"All results will be saved to: {experiment_folder}") - try: - logger.info(f"--- [Step 1/5] Running Preprocessing ---") + logger.info("--- [Step 1/5] Running Preprocessing ---") preprocessing.main( input_csv=args.input_csv, metadata_json=args.metadata_json_path, @@ -199,14 +200,14 @@ def run_pipeline(args): chain=args.chain, reference=args.reference, conf=args.conf, - output_csv_path=str(cleaned_csv_path) # Pass explicit path + output_csv_path=str(cleaned_csv_path), # Pass explicit path ) except Exception as e: logger.error(f"Preprocessing failed: {e}") - return + return try: - logger.info(f"--- [Step 2/5] Running Assembly ---") + logger.info("--- [Step 2/5] Running Assembly ---") assembly.main( input_csv_path=str(cleaned_csv_path), output_scaffolds_path=str(scaffolds_fasta_path), @@ -218,45 +219,45 @@ def run_pipeline(args): reference=args.reference, chain=args.chain, min_identity=args.min_identity, - max_mismatches=args.max_mismatches + max_mismatches=args.max_mismatches, ) except Exception as e: logger.error(f"Assembly failed: {e}") return try: - logger.info(f"--- [Step 3/5] Running Clustering ---") + logger.info("--- [Step 3/5] Running Clustering ---") clustering.main( - input_scaffolds_folder=str(scaffolds_folder), # Pass the folder + input_scaffolds_folder=str(scaffolds_folder), # Pass the folder min_seq_id=args.min_seq_id, - coverage=args.coverage + coverage=args.coverage, ) except Exception as e: logger.error(f"Clustering failed: {e}") return - + try: - logger.info(f"--- [Step 4/5] Running Alignment ---") + logger.info("--- [Step 4/5] Running Alignment ---") alignment.main( input_cluster_fasta_folder=str(cluster_fasta_folder), - output_alignment_folder=str(alignment_folder) + 
         )
     except Exception as e:
         logger.error(f"Alignment failed: {e}")
         return
-
+
     try:
-        logger.info(f"--- [Step 5/5] Running Consensus ---")
+        logger.info("--- [Step 5/5] Running Consensus ---")
         consensus.main(
             input_alignment_folder=str(alignment_folder),
             output_consensus_folder=str(consensus_folder),
-            run_id=run_id_str # Pass ID for logs
+            run_id=run_id_str,  # Pass ID for logs
         )
     except Exception as e:
         logger.error(f"Consensus failed: {e}")
         return

-    logger.info(f"--- InstaNexus Pipeline finished successfully! ---")
+    logger.info("--- InstaNexus Pipeline finished successfully! ---")
     logger.info(f"Final results in: {experiment_folder}")

@@ -274,4 +275,4 @@ def run_pipeline(args):
 # --kmer-size 7 \
 # --size-threshold 12 \
 # --min-overlap 3 \
-# --min-seq-id 0.85
\ No newline at end of file
+# --min-seq-id 0.85
diff --git a/src/instanexus/preprocessing.py b/src/instanexus/preprocessing.py
index bf50b4f..3f864d0 100755
--- a/src/instanexus/preprocessing.py
+++ b/src/instanexus/preprocessing.py
@@ -1,16 +1,16 @@
 #!/usr/bin/env python

-r""" Preprocessing module for InstaNexus.
+r"""Preprocessing module for InstaNexus.

 ██████████ ███████████ █████ █████
-░░███░░░░███ ░█░░░███░░░█░░███ ░░███
- ░███ ░░███░ ░███ ░ ░███ ░███
- ░███ ░███ ░███ ░███ ░███
- ░███ ░███ ░███ ░███ ░███
- ░███ ███ ░███ ░███ ░███
- ██████████ █████ ░░████████
-░░░░░░░░░░ ░░░░░ ░░░░░░░░
-
+░░███░░░░███ ░█░░░███░░░█░░███ ░░███
+ ░███ ░░███░ ░███ ░ ░███ ░███
+ ░███ ░███ ░███ ░███ ░███
+ ░███ ░███ ░███ ░███ ░███
+ ░███ ███ ░███ ░███ ░███
+ ██████████ █████ ░░████████
+░░░░░░░░░░ ░░░░░ ░░░░░░░░
+
 __authors__ = Marco Reverenna
 __copyright__ = Copyright 2025-2026
 __research-group__ = DTU Biosustain (Multi-omics Network Analytics) and DTU Bioengineering
@@ -20,28 +20,26 @@
 """

-# import libraries
-import os
-import re
 import json
 import logging
-import numpy as np
-import pandas as pd
-import plotly.express as px
-import plotly.graph_objects as go
+
+# import libraries
+import re
 from pathlib import Path
+
+import numpy as np
+import pandas as pd
 from Bio import SeqIO

+# PROJECT_ROOT = Path(__file__).resolve().parents[2]
+# JSON_DIR = PROJECT_ROOT / "json"

-#PROJECT_ROOT = Path(__file__).resolve().parents[2]
-#JSON_DIR = PROJECT_ROOT / "json"

 def get_sample_metadata(run, chain="", json_path=None):
     """Retrieve sample metadata from a JSON file based on the run and optional chain."""
     if json_path is None:
         raise ValueError("json_path must be provided.")
-
+
     with open(json_path, "r") as f:
         all_meta = json.load(f)

@@ -174,22 +172,21 @@ def filter_contaminants(seqs, run, contaminants_fasta):

 logger = logging.getLogger(__name__)

-
 def main(
     input_csv: str,
     metadata_json: str,
     contaminants_fasta: str,
     chain: str,
-    #folder_outputs: str,
+    # folder_outputs: str,
     reference: bool,
-    #assembly_mode: str,
+    # assembly_mode: str,
     conf: float,
     output_csv_path: str,
-    #kmer_size: int,
-    #size_threshold: int,
-    #min_overlap: int,
-    #min_identity: float,
-    #max_mismatches: int,
+    # kmer_size: int,
+    # size_threshold: int,
+    # min_overlap: int,
+    # min_identity: float,
+    # max_mismatches: int,
 ):
     """Main function to run the preprocessing script."""
     input_csv = Path(input_csv)
@@ -197,7 +194,7 @@ def main(
     print("Starting preprocessing pipeline.")

     input_csv = Path(input_csv)
-    run = input_csv.stem # stem gives the filename without suffix
+    run = input_csv.stem  # stem gives the filename without suffix

     # load metadata
     if chain:
@@ -221,15 +218,15 @@ def main(

     print("Parameters loaded.")

-    #folder_outputs = Path(folder_outputs) / run
-    #folder_outputs.mkdir(parents=True, exist_ok=True)
+    # folder_outputs = Path(folder_outputs) / run
+    # folder_outputs.mkdir(parents=True, exist_ok=True)

-    #folder_name_parts = [f"comb_{assembly_mode}", f"c{conf}", f"ts{size_threshold}", f"mo{min_overlap}"]
+    # folder_name_parts = [f"comb_{assembly_mode}", f"c{conf}", f"ts{size_threshold}", f"mo{min_overlap}"]

     # if assembly_mode == "dbg":
     #     folder_name_parts.insert(2, f"ks{kmer_size}")

-    #if reference:
+    # if reference:
     #     folder_name_parts.extend([f"mi{min_identity}", f"mm{max_mismatches}"])

     # combination_folder_out = folder_outputs / "_".join(folder_name_parts)
@@ -238,7 +235,7 @@ def main(
     # print(f"Output folders created at: {combination_folder_out}")

     logger.info("Starting data cleaning...")
-
+
     if reference:
         protein_norm = normalize_sequence(protein)

     df = pd.read_csv(input_csv)
@@ -246,16 +243,14 @@ def main(
     df["protease"] = df["experiment_name"].apply(
         lambda name: extract_protease(name, proteases)
     )
-
+
     df = clean_dataframe(df)
-
+
     df["cleaned_preds"] = df["preds"].apply(remove_modifications)
-
+
     cleaned_psms = df["cleaned_preds"].tolist()

-    filtered_psms = filter_contaminants(
-        cleaned_psms, run, contaminants_fasta
-    )
+    filtered_psms = filter_contaminants(cleaned_psms, run, contaminants_fasta)

     df = df[df["cleaned_preds"].isin(filtered_psms)]

     if reference:
@@ -269,15 +264,15 @@ def main(
         df = df[df["conf"] > conf]
     else:
         logger.info("No confidence threshold applied.")
-
+
     df.reset_index(drop=True, inplace=True)
-
+
     logger.info("Data cleaning completed.")

     cleaned_csv_path = Path(output_csv_path)
     cleaned_csv_path.parent.mkdir(parents=True, exist_ok=True)
-    #cleaned_csv_path = combination_folder_out / "cleaned" / "cleaned_data.csv"
-
+    # cleaned_csv_path = combination_folder_out / "cleaned" / "cleaned_data.csv"
+
     df.to_csv(cleaned_csv_path, index=False)
     logger.info("Cleaned data saved to: {}.".format(cleaned_csv_path))

@@ -371,14 +366,15 @@ def cli():
         "--output-csv-path",
         type=str,
         required=True,
-        help="Path to the output CSV file."
+        help="Path to the output CSV file.",
     )

     args = parser.parse_args()
     main(**vars(args))

+
 if __name__ == "__main__":
     cli()

-# python -m instanexus.preprocessing --input-csv inputs/bsa.csv --metadata-json json/sample_metadata.json --contaminants-fasta fasta/contaminants.fasta --output-csv-path outputs/bsa_cleaned.csv --conf 0.9 --reference
\ No newline at end of file
+# python -m instanexus.preprocessing --input-csv inputs/bsa.csv --metadata-json json/sample_metadata.json --contaminants-fasta fasta/contaminants.fasta --output-csv-path outputs/bsa_cleaned.csv --conf 0.9 --reference
diff --git a/src/instanexus/visualization.py b/src/instanexus/visualization.py
index 0b01eb5..44b6d6f 100644
--- a/src/instanexus/visualization.py
+++ b/src/instanexus/visualization.py
@@ -1,15 +1,15 @@
 #!/usr/bin/env python

-r""" Visualization module for InstaNexus.
+r"""Visualization module for InstaNexus.

 ██████████ ███████████ █████ █████
-░░███░░░░███ ░█░░░███░░░█░░███ ░░███
- ░███ ░░███░ ░███ ░ ░███ ░███
- ░███ ░███ ░███ ░███ ░███
- ░███ ░███ ░███ ░███ ░███
- ░███ ███ ░███ ░███ ░███
- ██████████ █████ ░░████████
-░░░░░░░░░░ ░░░░░ ░░░░░░░░
+░░███░░░░███ ░█░░░███░░░█░░███ ░░███
+ ░███ ░░███░ ░███ ░ ░███ ░███
+ ░███ ░███ ░███ ░███ ░███
+ ░███ ░███ ░███ ░███ ░███
+ ░███ ███ ░███ ░███ ░███
+ ██████████ █████ ░░████████
+░░░░░░░░░░ ░░░░░ ░░░░░░░░

 __authors__ = Marco Reverenna
 __copyright__ = Copyright 2025-2026
@@ -21,11 +21,12 @@
 """

 import os
-import numpy as np
-import pandas as pd
+
 import Bio.SeqIO
 import matplotlib.patches as patches
 import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
 import plotly.express as px
 import plotly.graph_objects as go
 import seaborn as sns
@@ -429,9 +430,7 @@ def plot_relative_map_distribution(run, df, reference, folder, title=False):
             x1 = fine_x[i + 1]
             y_segment = (fine_y[i] + fine_y[i + 1]) / 2.0
             x_mid = (x0 + x1) / 2.0
-            alpha = 1 - (x_mid - 0.88) / (
-                1 - 0.88
-            )
+            alpha = 1 - (x_mid - 0.88) / (1 - 0.88)
             fillcolor = (
                 f"rgba({base_color[0]}, {base_color[1]}, {base_color[2]}, {alpha:.2f})"
             )
@@ -571,7 +570,7 @@ def plot_confidence_distribution(df, folder_figures, min_conf=0, max_conf=1):
     fig.update_xaxes(
         showgrid=True,
         gridcolor="lightgray",
-        ticklabelposition="outside bottom", 
+        ticklabelposition="outside bottom",
         dtick=0.02,
     )
     fig.update_yaxes(showgrid=True, gridcolor="lightgray")
@@ -603,8 +602,8 @@ def plot_protease_distribution(protease_counts, folder_figures):
     fig.update_traces(textposition="outside", width=0.4)

     mm_to_px = 3.78
-    width_mm = 240 
-    height_mm = 200 
+    width_mm = 240
+    height_mm = 200

     fig.update_layout(
         width=int(width_mm * mm_to_px),
@@ -619,12 +618,13 @@ def plot_protease_distribution(protease_counts, folder_figures):
             color="black",
         ),
         margin=dict(t=50, b=50, l=50, r=100),
-        plot_bgcolor="white", 
+        plot_bgcolor="white",
         paper_bgcolor="white",
     )

     fig.write_image(f"{folder_figures}/proteases_distribution.svg")

+
 def map_to_protein(seq, protein, max_mismatches, min_identity):
     """Maps a sequence (`seq`) to a target protein sequence, allowing for mismatches,
     and identifies the best match based on the maximum mismatches and minimum identity threshold.
@@ -893,8 +893,7 @@ def mapping_sequences(mapped_sequences, prot_seq, title, output_folder, output_f

 def create_dataframe_from_mapped_sequences(data):
-    """Takes a list of tuples containing sequence data and returns a structured DataFrame.
- """ + """Takes a list of tuples containing sequence data and returns a structured DataFrame.""" # Create the initial DataFrame df = pd.DataFrame(data, columns=["Sequence", "Details"]) @@ -1185,4 +1184,4 @@ def mapping_psms_protease_associated( font=dict(size=14, family="Arial, sans-serif", color="black"), ) - fig.write_image(f"{output_folder}/{output_file}", scale=2) \ No newline at end of file + fig.write_image(f"{output_folder}/{output_file}", scale=2) diff --git a/tests/test_preprocessing.py b/tests/test_preprocessing.py index 157531c..ff64ff2 100644 --- a/tests/test_preprocessing.py +++ b/tests/test_preprocessing.py @@ -1,21 +1,17 @@ #!/usr/bin/env python3 -import pytest from instanexus.preprocessing import remove_modifications + def test_remove_modifications(): assert remove_modifications("A(ox)BC(mod)D") == "ABCD" assert remove_modifications("A[UNIMOD:21]BC[UNIMOD:35]D") == "ABCD" - assert ( - remove_modifications("A(ox)[UNIMOD:21]BC(mod)[UNIMOD:35]D") == "ABCD" - ) + assert remove_modifications("A(ox)[UNIMOD:21]BC(mod)[UNIMOD:35]D") == "ABCD" assert remove_modifications(None) is None assert remove_modifications("ACD") == "ACD" assert remove_modifications("A(I)BCD") == "ABCD" assert remove_modifications("A(ox)B(I)C(mod)D") == "ABCD" - assert ( - remove_modifications("A(ox)[UNIMOD:21]B(I)C(mod)[UNIMOD:35]D") == "ABCD" - ) + assert remove_modifications("A(ox)[UNIMOD:21]B(I)C(mod)[UNIMOD:35]D") == "ABCD" assert remove_modifications("AI BCD") == "AL BCD" - assert remove_modifications("A(ox)I B(mod)CD") == "AL BCD" \ No newline at end of file + assert remove_modifications("A(ox)I B(mod)CD") == "AL BCD"