
Commit 7646229

Merge pull request #88 from pinellolab/v218
Fix - Handle missing BED features per contig and internalize BED indexing logic
2 parents: a4e2baf + e551c07

5 files changed: +177 additions, -88 deletions


PostProcess/add_risk_score.py

Lines changed: 102 additions & 31 deletions
@@ -1,36 +1,107 @@
 #!/usr/bin/env python
+"""Adds risk score columns to a report file and computes risk scores for each entry.
+
+This module processes a tab-separated report file, calculates risk score differences,
+and writes the results to a new file with updated columns. It supports both primary
+and alternative target scenarios based on user input.
+"""
+
+from typing import List, Tuple
 
 import sys
 
+# risk score column names
+RISKCOLNAMES = ["Highest_CFD_Risk_Score", "Highest_CFD_Absolute_Risk_Score"]
+
+
+def add_risk_score_columns(header: List[str], alternative: bool) -> List[str]:
+    """Appends risk score columns and optionally a cluster ID to the header list.
+
+    This function updates the provided header list by adding predefined risk score
+    columns, and adds a cluster ID column if the alternative flag is set.
+
+    Args:
+        header: The list of column names to be updated.
+        alternative: Whether to add the cluster ID column.
+
+    Returns:
+        List[str]: The updated list of column names including risk score columns.
+    """
+    header.extend(iter(RISKCOLNAMES))
+    if alternative:
+        header.append("CLUSTER_ID")
+    return header
+
+
+def _compute_risk_score(line: str) -> Tuple[float, float]:
+    """Calculates the risk score difference and its absolute value from a line of
+    report data.
+
+    This function extracts two score values from the input line, computes their
+    difference, and returns both the difference and its absolute value.
+
+    Args:
+        line: A tab-separated string containing report data.
+
+    Returns:
+        Tuple[float, float]: The risk score difference and its absolute value.
+    """
+    fields = line.strip().split("\t")
+    score, score_alt = float(fields[20]), float(fields[21])
+    score_diff = score - score_alt
+    return score_diff, abs(score_diff)
+
+
+def compute_risk_score(report_fname: str, report_outfname: str, alternative: bool):
+    """Reads a report file, computes risk scores for each line, and writes the
+    results to a new file.
+
+    This function processes the input report, appends risk score columns, and
+    outputs the updated data. It handles both primary and alternative target
+    scenarios based on the provided flag.
+
+    Args:
+        report_fname: Path to the input report file.
+        report_outfname: Path to the output report file.
+        alternative: Whether to process as alternative targets.
+
+    Returns:
+        None
+
+    Raises:
+        OSError: If an error occurs while reading or writing files.
+    """
+    try:
+        with open(report_fname, mode="r") as fin, open(
+            report_outfname, mode="w"
+        ) as fout:
+            header = fin.readline().strip().split("\t")  # read file header
+            header = add_risk_score_columns(
+                header, alternative
+            )  # append risk score columns
+            fout.write("\t".join(header) + "\n")  # write header to out put report
+            for line in fin:
+                score_diff, score_diff_abs = _compute_risk_score(line)
+                fout.write(f"{line}\t{score_diff}\t{score_diff_abs}\n")
+    except (IOError, Exception) as e:
+        raise OSError(
+            f"An error occurred while computing risk scores for {report_fname}"
+        ) from e
+
+
+def risk_score() -> None:
+    """Parses command-line arguments and computes risk scores for a report file.
+
+    This function reads input and output file paths and a flag from the command
+    line, then processes the report to add risk score columns accordingly.
+
+    Returns:
+        None
+    """
+    report_fname, report_outfname, alternative = sys.argv[1:4]  # read input args
+    alternative = alternative == "True"  # are primary or alternative targets?
+    # compute risk score on CFD/CRISTA
+    compute_risk_score(report_fname, report_outfname, alternative)
+
 
-file_in = sys.argv[1]
-file_out = sys.argv[2]
-alt = sys.argv[3]
-alt = alt == "True"
-with open(file_in, "r") as fin:
-    with open(file_out, "w") as fout:
-        header = fin.readline().strip().split("\t")
-        # header.insert(22, 'Highest_CFD_Risk_Score')
-        # header.insert(23, 'Highest_CFD_Absolute_Risk_Score')
-        # header.append('MMBLG_CFD_Risk_Score')
-        # header.append('MMBLG_CFD_Absolute_Risk_Score')
-        header.append("Highest_CFD_Risk_Score")
-        header.append("Highest_CFD_Absolute_Risk_Score")
-        if alt:
-            header.append("CLUSTER_ID")
-        fout.write("\t".join(header) + "\n")
-        for line in fin:
-            splitted = line.strip().split("\t")
-            cfd_diff = float(splitted[20]) - float(splitted[21])
-            abs_diff = abs(cfd_diff)
-            # mmblg_cfd_diff = float(splitted[42]) - float(splitted[43])
-            # mmblg_abs_diff = abs(mmblg_cfd_diff)
-            fout.write(
-                "\t".join(splitted) + "\t" + str(cfd_diff) + "\t" + str(abs_diff) + "\n"
-            )
-            # if alt:
-            #     fout.write('\t'.join(splitted[:22])+'\t'+"{:.3f}".format(cfd_diff)+'\t'+"{:.3f}".format(abs_diff)+"\t"+"\t".join(
-            #         splitted[22:-1])+'\t'+"{:.3f}".format(mmblg_cfd_diff)+'\t'+"{:.3f}".format(mmblg_abs_diff)+"\t"+splitted[-1]+'\n')
-            # else:
-            #     fout.write('\t'.join(splitted[:22])+'\t'+"{:.3f}".format(cfd_diff)+'\t'+"{:.3f}".format(abs_diff)+"\t"+"\t".join(
-            #         splitted[22:])+'\t'+"{:.3f}".format(mmblg_cfd_diff)+'\t'+"{:.3f}".format(mmblg_abs_diff)+'\n')
+risk_score()

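The refactored script keeps the old command-line interface (input report, output report, and a True/False flag for alternative targets), with the per-line arithmetic now isolated in _compute_risk_score and the header handling in add_risk_score_columns. A minimal invocation sketch in Python, using hypothetical file names and assuming the script is executable and run from the PostProcess directory:

import subprocess

# Hypothetical report names; the third argument selects alternative-target mode
# ("True" also appends a CLUSTER_ID column to the header, "False" does not).
subprocess.run(
    ["./add_risk_score.py", "example.bestCFD.txt", "example.bestCFD.risk.txt", "False"],
    check=True,
)
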
PostProcess/annotation.py

Lines changed: 2 additions & 0 deletions
@@ -102,6 +102,8 @@ def annotate_target(chrom: str, start: int, stop: int, annotation: TabixFile) ->
     Returns:
         Comma-separated string of annotation features for the target.
     """
+    if chrom not in annotation.contigs:
+        return "n"  # contig not in input annotation file
     target_anns = {
         feature.split()[3]
         for feature in annotation.fetch(chrom, start, stop)

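The added guard returns the "n" placeholder (the same value remove_n_and_dots.py later rewrites to NA) whenever the target's contig is missing from the tabix-indexed annotation BED, instead of asking TabixFile.fetch for a region the index does not contain. A minimal sketch of the pattern with pysam, using a hypothetical bgzip-compressed and tabix-indexed annotations.bed.gz and hypothetical coordinates:

import pysam

# Skip contigs the tabix index does not contain, mirroring the guard added to
# annotate_target(); the annotation file and coordinates here are hypothetical.
annotation = pysam.TabixFile("annotations.bed.gz")  # expects annotations.bed.gz.tbi
chrom, start, stop = "chr1", 1_000_000, 1_000_100

if chrom not in annotation.contigs:
    target_anns = "n"  # contig not covered by the annotation file
else:
    # BED name field (4th column) of every feature overlapping the target
    target_anns = ",".join(
        {feature.split()[3] for feature in annotation.fetch(chrom, start, stop)}
    )
print(target_anns)
annotation.close()
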
PostProcess/post_analisi_indel.sh

Lines changed: 0 additions & 35 deletions
@@ -34,38 +34,3 @@ sed -i 1i"$header" "$output_folder/crispritz_targets/indels_${ref_name}+${vcf_na
 ./analisi_indels_NNN.sh "$output_folder/crispritz_targets/${ref_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr" "$output_folder/crispritz_targets/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr" "$output_folder/${fake_chr}_${pam_name}_${guide_name}_${annotation_name}_${mm}_${bDNA}_${bRNA}" "$annotation_file" "$dict_folder/log_indels_$vcf_name" "$ref_folder/$true_chr.fa" $mm $bDNA $bRNA "$guide_file" "$pam_file" "$output_folder"
 rm "$output_folder/crispritz_targets/${ref_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr"
 rm "$output_folder/crispritz_targets/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr"
-
-# vcf_name=$(basename $3)
-# guide_file=$4
-# guide_name=$(basename $4)
-# mm=$5
-# bDNA=$6
-# bRNA=$7
-# annotation_file=$8
-# annotation_name=$(basename $8)
-# pam_file=$9
-# pam_name=$(basename $9)
-# # sampleID=${10}
-# dict_folder=${10}
-
-# final_res=${11}
-# final_res_alt=${12}
-
-# key=${13}
-
-# echo "Processing INDELs results for $key, starting post-analysis"
-# true_chr=$key
-# fake_chr="fake$true_chr"
-
-# # create file to prevent void grep failing
-# touch "$output_folder/crispritz_targets/${ref_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr"
-# # create file to prevent void grep failing
-# touch "$output_folder/crispritz_targets/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr"
-# awk -v fake_chr="$fake_chr" '$0 ~ fake_chr {print}' "$output_folder/crispritz_targets/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt" >"$output_folder/crispritz_targets/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr"
-# header=$(head -1 $output_folder/crispritz_targets/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt)
-# sed -i 1i"$header" "$output_folder/crispritz_targets/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr"
-
-# ./analisi_indels_NNN.sh "$output_folder/crispritz_targets/${ref_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr" "$output_folder/crispritz_targets/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr" "$output_folder/${fake_chr}_${pam_name}_${guide_name}_${annotation_name}_${mm}_${bDNA}_${bRNA}" "$annotation_file" "$dict_folder/log_indels_$vcf_name" "$ref_folder/$true_chr.fa" $mm $bDNA $bRNA "$guide_file" "$pam_file" "$output_folder" || {
-#     echo "CRISPRme ERROR: indels analysis failed (script: ${0} line $((LINENO - 1)))" >&2
-#     exit 1
-# }

PostProcess/remove_n_and_dots.py

Lines changed: 71 additions & 20 deletions
@@ -1,36 +1,87 @@
 #!/usr/bin/env python
-"""
-Created on Sun May 30 15:49:39 2021
+"""Processes a report file to replace 'n' and '.' values with 'NA'.
 
-@author: franc
+This module reads a tab-separated report file in chunks, cleans specific values,
+and overwrites the original file with the cleaned data. It is intended for use
+in post-processing steps where standardized missing value representation is required.
 """
 
-"""
-Script used to replace n and . with NA in bestMerge
-"""
-
-import warnings
+from pandas.io.parsers import TextFileReader
 
-warnings.simplefilter(action="ignore", category=FutureWarning)
 import pandas as pd
+
+import subprocess
+import warnings
 import sys
 import os
 
-in_path = sys.argv[1]
+warnings.simplefilter(action="ignore", category=FutureWarning)
+
+# dataframe read chunksize
+CHUNKSIZE = 50000
+
+
+def read_report_chunks(report_fname: str) -> TextFileReader:
+    """Reads a tab-separated report file in chunks for efficient processing.
+
+    This function returns an iterator that yields DataFrame chunks from the
+    specified file.
+
+    Args:
+        report_fname: Path to the tab-separated report file.
+
+    Returns:
+        TextFileReader: An iterator over DataFrame chunks.
+    """
+    return pd.read_csv(report_fname, sep="\t", chunksize=CHUNKSIZE)
+
+
+def polish_report(report_chunks: TextFileReader, report_fname: str) -> None:
+    """Cleans and writes report data by replacing specific values with 'NA'.
+
+    This function processes each chunk of the report, replacing 'n' and '.' values
+    with 'NA', and writes the cleaned data to a temporary file.
+
+    Args:
+        report_chunks: An iterator over DataFrame chunks from the report.
+        report_fname: Path to the original report file.
 
-chunksize_ = 50000
-chunks = pd.read_csv(in_path, sep="\t", chunksize=chunksize_)
+    Returns:
+        None
+    """
+    header = True  # only for first iteration
+    for chunk in report_chunks:
+        assert "rsID" in chunk.columns.tolist()
+        chunk: pd.DataFrame = chunk.replace("n", "NA")  # replace ns with NAs
+        chunk["rsID"] = chunk["rsID"].str.replace(
+            ".", "NA"
+        )  # replace . in rsids with NAs
+        chunk.to_csv(
+            f"{report_fname}.tmp", header=header, mode="a", sep="\t", index=False
+        )
+        header = False  # not required anymore
 
 
-header = True
-for chunk in chunks:
+def remove_n_dots() -> None:
+    """Replaces 'n' and '.' values in a report file with 'NA' and overwrites the
+    original file.
 
-    chunk = chunk.replace("n", "NA")
-    # chunk = chunk.replace(regex=['\*.,\*', '\*,.\*'], value='NA')
-    chunk["rsID"] = chunk["rsID"].str.replace(".", "NA")
+    This function reads a report file, cleans specific values, and saves the cleaned
+    data back to the original file.
 
-    chunk.to_csv(in_path + ".tmp", header=header, mode="a", sep="\t", index=False)
+    Returns:
+        None
+    """
+    report_fname = sys.argv[1]  # read input report filename
+    if not os.path.isfile(report_fname) or os.stat(report_fname).st_size <= 0:
+        raise FileNotFoundError(f"Report {report_fname} not found or empty")
+    # remove ns and dots and replace them with NAs
+    polish_report(read_report_chunks(report_fname), report_fname)
+    if not os.path.isfile(f"{report_fname}.tmp"):
+        raise FileNotFoundError(f"Polished report {report_fname}.tmp not created?")
+    code = subprocess.call(f"mv {report_fname}.tmp {report_fname}")
+    if code != 0:
+        raise subprocess.SubprocessError(f"Failed renaming {report_fname}.tmp")
 
-    header = False
 
-os.system(f"mv {in_path}.tmp {in_path}")
+remove_n_dots()

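The cleaning logic is unchanged in substance: the report is still streamed through pandas in 50,000-row chunks, 'n' cells and '.' rsIDs are rewritten to NA, and the cleaned chunks are appended to a temporary file that then replaces the original. A minimal sketch of that chunked pattern, assuming a hypothetical tab-separated example_report.tsv with an rsID column:

import pandas as pd

# Hypothetical input file; mirrors the chunked cleaning done by polish_report().
report = "example_report.tsv"
header = True
for chunk in pd.read_csv(report, sep="\t", chunksize=50000):
    chunk = chunk.replace("n", "NA")  # 'n' cells -> NA
    chunk["rsID"] = chunk["rsID"].str.replace(".", "NA")  # '.' rsIDs -> NA
    chunk.to_csv(f"{report}.tmp", header=header, mode="a", sep="\t", index=False)
    header = False  # write the header only once
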
PostProcess/submit_job_automated_new_multiple_vcfs.sh

Lines changed: 2 additions & 2 deletions
@@ -50,7 +50,7 @@ email=${17} # email address (website only)
 echo -e "MAIL: $email"
 echo -e "CPU used: $ncpus"
 
-#used to solve base editor check in resultintegration phase
+# used to solve base editor check in result-integration phase
 base_check_start=${18}
 base_check_end=${19}
 base_check_set=${20}
@@ -717,7 +717,7 @@ if [ -s $logerror ]; then
     rm -f $output_folder/*.bestCFD.txt $output_folder/*.bestmmblg.txt $output_folder/*.bestCRISTA.txt $output_folder/*.bestMerge.txt $output_folder/*.altMerge.txt
     exit 1
 fi
-# remove Ns and dots from rsID from primary targets files
+# remove Ns and dots from rsID from alternative targets files
 ./remove_n_and_dots.py $final_res_alt.bestCFD.txt &
 ./remove_n_and_dots.py $final_res_alt.bestmmblg.txt &
 ./remove_n_and_dots.py $final_res_alt.bestCRISTA.txt &
