Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 7 additions & 8 deletions docs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,23 +6,22 @@
# -- Project information -----------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information

project = 'InstaNexus'
copyright = '2025, Marco Reverenna'
author = 'Marco Reverenna'
release = '0.2.0'
project = "InstaNexus"
copyright = "2025, Marco Reverenna"
author = "Marco Reverenna"
release = "0.2.0"

# -- General configuration ---------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration

extensions = []

templates_path = ['_templates']
templates_path = ["_templates"]
exclude_patterns = []



# -- Options for HTML output -------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output

html_theme = 'alabaster'
html_static_path = ['_static']
html_theme = "alabaster"
html_static_path = ["_static"]
Original file line number Diff line number Diff line change
Expand Up @@ -519,7 +519,7 @@
" sequence_type=\"contigs\",\n",
" output_folder=RESULTS_DIR,\n",
" reference=protein_norm,\n",
" **params\n",
" **params,\n",
" )\n",
"\n",
" coverage_contigs = stat_contigs.get(\"coverage\")\n",
Expand Down Expand Up @@ -553,7 +553,7 @@
" sequence_type=\"scaffolds\",\n",
" output_folder=RESULTS_DIR,\n",
" reference=protein_norm,\n",
" **params\n",
" **params,\n",
" )\n",
"\n",
" coverage_scaffolds = stat_scaffolds.get(\"coverage\")\n",
Expand Down Expand Up @@ -814,7 +814,7 @@
" sequence_type=\"contigs\",\n",
" output_folder=RESULTS_DIR,\n",
" reference=protein_norm,\n",
" **params\n",
" **params,\n",
" )\n",
" coverage_contigs = stat_contigs.get(\"coverage\")\n",
"\n",
Expand Down Expand Up @@ -847,7 +847,7 @@
" sequence_type=\"scaffolds\",\n",
" output_folder=RESULTS_DIR,\n",
" reference=protein_norm,\n",
" **params\n",
" **params,\n",
" )\n",
"\n",
" coverage_scaffolds = stat_scaffolds.get(\"coverage\")\n",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -508,7 +508,7 @@
" sequence_type=\"contigs\",\n",
" output_folder=\".\",\n",
" reference=protein_norm,\n",
" **params\n",
" **params,\n",
" )\n",
"\n",
" coverage_contigs = stat_contigs.get(\"coverage\")\n",
Expand Down Expand Up @@ -556,7 +556,7 @@
" sequence_type=\"scaffolds\",\n",
" output_folder=\".\",\n",
" reference=protein_norm,\n",
" **params\n",
" **params,\n",
" )\n",
" coverage_scaffolds = stat_scaffolds.get(\"coverage\")\n",
"\n",
Expand Down Expand Up @@ -740,7 +740,7 @@
" sequence_type=\"contigs\",\n",
" output_folder=\".\",\n",
" reference=protein_norm,\n",
" **params\n",
" **params,\n",
" )\n",
" coverage_contigs = stat_contigs.get(\"coverage\")\n",
"\n",
Expand Down Expand Up @@ -784,7 +784,7 @@
" sequence_type=\"scaffolds\",\n",
" output_folder=\".\",\n",
" reference=protein_norm,\n",
" **params\n",
" **params,\n",
" )\n",
" coverage_scaffolds = stat_scaffolds.get(\"coverage\")\n",
"\n",
Expand Down
49 changes: 26 additions & 23 deletions docs/source/tutorials/examples/dbg_variants_workflow.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@
"outputs": [],
"source": [
"# read a pre cleaned data file\n",
"#data = pd.read_csv(\"../outputs/bsa/comb_dbg_c0.9_ks7_ts12_mo3/cleaned/cleaned_data.csv\")"
"# data = pd.read_csv(\"../outputs/bsa/comb_dbg_c0.9_ks7_ts12_mo3/cleaned/cleaned_data.csv\")"
]
},
{
Expand All @@ -62,9 +62,9 @@
"\n",
"import re\n",
"\n",
"file_name = 'bsa'\n",
"file_name = \"bsa\"\n",
"\n",
"data = pd.read_csv(f'../inputs/{file_name}.csv'.format(file_name=file_name))\n",
"data = pd.read_csv(f\"../inputs/{file_name}.csv\".format(file_name=file_name))\n",
"\n",
"data[\"log_probs\"] = data[\"log_probs\"].replace(-1, -10)\n",
"\n",
Expand Down Expand Up @@ -99,13 +99,12 @@
"metadata": {},
"outputs": [],
"source": [
"from pathlib import Path\n",
"\n",
"repo_folder = Path(\"../\")\n",
"\n",
"filtered_psms = instanexus.preprocessing.filter_contaminants(\n",
" cleaned_psms, run, repo_folder / \"fasta/contaminants.fasta\"\n",
" )\n",
" cleaned_psms, run, repo_folder / \"fasta/contaminants.fasta\"\n",
")\n",
"\n",
"data = data[data[\"preds\"].isin(filtered_psms)]"
]
Expand Down Expand Up @@ -158,10 +157,10 @@
"source": [
"assembler = Assembler(\n",
" mode=\"dbg_weighted\",\n",
" kmer_size=7, \n",
" size_threshold=0, \n",
" min_weight=2, # filter low-weight edges\n",
" refine_rounds=3, # optional iterative refinement\n",
" kmer_size=7,\n",
" size_threshold=0,\n",
" min_weight=2, # filter low-weight edges\n",
" refine_rounds=3, # optional iterative refinement\n",
")"
]
},
Expand All @@ -172,7 +171,9 @@
"metadata": {},
"outputs": [],
"source": [
"scaffolds_dbg_w = assembler.run(sequences, output_folder=output_folder, protein_norm=None)"
"scaffolds_dbg_w = assembler.run(\n",
" sequences, output_folder=output_folder, protein_norm=None\n",
")"
]
},
{
Expand Down Expand Up @@ -242,7 +243,9 @@
"metadata": {},
"outputs": [],
"source": [
"mapped_contigs = map.process_protein_contigs_scaffold(scaffolds_dbg_w, protein_norm, max_mismatches = 10, min_identity = 0.8)"
"mapped_contigs = map.process_protein_contigs_scaffold(\n",
" scaffolds_dbg_w, protein_norm, max_mismatches=10, min_identity=0.8\n",
")"
]
},
{
Expand Down Expand Up @@ -338,8 +341,8 @@
"assembler_dbgx = Assembler(\n",
" mode=\"dbgX\",\n",
" kmer_size=7,\n",
" size_threshold=10, \n",
" min_weight=2, \n",
" size_threshold=10,\n",
" min_weight=2,\n",
")"
]
},
Expand All @@ -351,9 +354,7 @@
"outputs": [],
"source": [
"scaffolds_dbgx = assembler_dbgx.run(\n",
" sequences=sequences,\n",
" output_folder=output_folder,\n",
" protein_norm=None\n",
" sequences=sequences, output_folder=output_folder, protein_norm=None\n",
")"
]
},
Expand All @@ -364,7 +365,9 @@
"metadata": {},
"outputs": [],
"source": [
"mapped_scaffolds_dbgx = map.process_protein_contigs_scaffold(scaffolds_dbgx, protein_norm, max_mismatches = 10, min_identity = 0.8)"
"mapped_scaffolds_dbgx = map.process_protein_contigs_scaffold(\n",
" scaffolds_dbgx, protein_norm, max_mismatches=10, min_identity=0.8\n",
")"
]
},
{
Expand Down Expand Up @@ -427,7 +430,7 @@
" mode=\"fusion\",\n",
" kmer_size=7,\n",
" size_threshold=10,\n",
" min_overlap=3, \n",
" min_overlap=3,\n",
" min_weight=2,\n",
")"
]
Expand All @@ -450,9 +453,7 @@
"outputs": [],
"source": [
"scaffolds_fusion = assembler_fusion.run(\n",
" sequences=sequences,\n",
" output_folder=output_folder_fusion,\n",
" protein_norm=None\n",
" sequences=sequences, output_folder=output_folder_fusion, protein_norm=None\n",
")"
]
},
Expand All @@ -463,7 +464,9 @@
"metadata": {},
"outputs": [],
"source": [
"mapped_scaffolds_fusion = map.process_protein_contigs_scaffold(scaffolds_fusion, protein_norm, max_mismatches=10, min_identity=0.8)\n",
"mapped_scaffolds_fusion = map.process_protein_contigs_scaffold(\n",
" scaffolds_fusion, protein_norm, max_mismatches=10, min_identity=0.8\n",
")\n",
"\n",
"# top 20\n",
"mapped_scaffolds_fusion = mapped_scaffolds_fusion[:20]"
Expand Down
63 changes: 27 additions & 36 deletions docs/source/tutorials/examples/hybrid_workflow_with_figures.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -39,19 +39,17 @@
"import alignment as align\n",
"import clustering as clus\n",
"import preprocessing as prep\n",
"import compute_statistics as comp_stat\n",
"#import model_peptide_selector as selector\n",
"\n",
"# import model_peptide_selector as selector\n",
"\n",
"# import libraries\n",
"from pathlib import Path\n",
"from Bio import SeqIO\n",
"\n",
"#import joblib\n",
"# import joblib\n",
"import json\n",
"import Bio\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns"
"import pandas as pd"
]
},
{
Expand Down Expand Up @@ -131,16 +129,10 @@
"metadata": {},
"outputs": [],
"source": [
"def get_combination_name(\n",
" ass_method,\n",
" conf,\n",
" kmer_size,\n",
" size_threshold,\n",
" min_overlap\n",
"):\n",
"def get_combination_name(ass_method, conf, kmer_size, size_threshold, min_overlap):\n",
" if ass_method in (\"dbg\", \"hybrid\"):\n",
" return f\"comb_{ass_method}_c{conf}_ks{kmer_size}_ts{size_threshold}_mo{min_overlap}\"\n",
" \n",
"\n",
" elif ass_method == \"greedy\":\n",
" return f\"comb_{ass_method}_c{conf}_ts{size_threshold}_mo{min_overlap}\""
]
Expand Down Expand Up @@ -186,12 +178,7 @@
"metadata": {},
"outputs": [],
"source": [
"comb = get_combination_name(\n",
" ass_method,\n",
" conf,\n",
" kmer_size,\n",
" size_threshold,\n",
" min_overlap)\n",
"comb = get_combination_name(ass_method, conf, kmer_size, size_threshold, min_overlap)\n",
"\n",
"print(comb)"
]
Expand All @@ -207,7 +194,7 @@
" \"ass_method\": ass_method,\n",
" \"conf\": conf,\n",
" \"size_threshold\": size_threshold,\n",
" \"min_overlap\": min_overlap\n",
" \"min_overlap\": min_overlap,\n",
"}"
]
},
Expand Down Expand Up @@ -346,7 +333,9 @@
" filtered_df = df[mask].copy()\n",
" removed_count = (~mask).sum()\n",
"\n",
" print(f\"Removed {removed_count} contaminant sequences, {len(filtered_df)} remaining.\")\n",
" print(\n",
" f\"Removed {removed_count} contaminant sequences, {len(filtered_df)} remaining.\"\n",
" )\n",
" return filtered_df"
]
},
Expand Down Expand Up @@ -492,9 +481,9 @@
"metadata": {},
"outputs": [],
"source": [
"greedy_scaffolds = greedy.scaffold_iterative_greedy(assembled_contigs,\n",
" min_overlap,\n",
" size_threshold)"
"greedy_scaffolds = greedy.scaffold_iterative_greedy(\n",
" assembled_contigs, min_overlap, size_threshold\n",
")"
]
},
{
Expand Down Expand Up @@ -655,10 +644,12 @@
"outputs": [],
"source": [
"mapped_scaffolds = map.process_protein_contigs_scaffold(\n",
" all_scaffolds, protein_norm, max_mismatches = 0, min_identity = 0.90\n",
" all_scaffolds, protein_norm, max_mismatches=0, min_identity=0.90\n",
")\n",
"\n",
"map.mapping_substitutions(mapped_scaffolds, protein_norm, title= \"scaffolds mapped in RF-selected peptides\")"
"map.mapping_substitutions(\n",
" mapped_scaffolds, protein_norm, title=\"scaffolds mapped in RF-selected peptides\"\n",
")"
]
},
{
Expand Down Expand Up @@ -754,14 +745,14 @@
"source": [
"clus.cluster_fasta_files(input_folder=str(scaffolds_folder_out))\n",
"\n",
"fasta_input = scaffolds_folder_out / f\"scaffolds.fasta\"\n",
"fasta_input = scaffolds_folder_out / \"scaffolds.fasta\"\n",
"\n",
"cluster_tsv_folder = clustering_out / run_id\n",
" \n",
"\n",
"clus.process_fasta_and_clusters(\n",
" fasta_file=str(fasta_input),\n",
" input_folder=str(scaffolds_folder_out),\n",
" )"
" fasta_file=str(fasta_input),\n",
" input_folder=str(scaffolds_folder_out),\n",
")"
]
},
{
Expand Down Expand Up @@ -798,10 +789,10 @@
"outputs": [],
"source": [
"cons.process_alignment_files(\n",
" align_folder=str(alignment_out),\n",
" output_folder=str(consensus_out),\n",
" run_id=run_id,\n",
" )"
" align_folder=str(alignment_out),\n",
" output_folder=str(consensus_out),\n",
" run_id=run_id,\n",
")"
]
}
],
Expand Down
Loading
Loading