
Commit 7646229

Merge pull request #88 from pinellolab/v218
Fix - Handle missing BED features per contig and internalize BED indexing logic
2 parents: a4e2baf + e551c07

5 files changed: +177 additions, -88 deletions


PostProcess/add_risk_score.py

Lines changed: 102 additions & 31 deletions
@@ -1,36 +1,107 @@
 #!/usr/bin/env python
+"""Adds risk score columns to a report file and computes risk scores for each entry.
+
+This module processes a tab-separated report file, calculates risk score differences,
+and writes the results to a new file with updated columns. It supports both primary
+and alternative target scenarios based on user input.
+"""
+
+from typing import List, Tuple
 
 import sys
 
+# risk score column names
+RISKCOLNAMES = ["Highest_CFD_Risk_Score", "Highest_CFD_Absolute_Risk_Score"]
+
+
+def add_risk_score_columns(header: List[str], alternative: bool) -> List[str]:
+    """Appends risk score columns and optionally a cluster ID to the header list.
+
+    This function updates the provided header list by adding predefined risk score
+    columns, and adds a cluster ID column if the alternative flag is set.
+
+    Args:
+        header: The list of column names to be updated.
+        alternative: Whether to add the cluster ID column.
+
+    Returns:
+        List[str]: The updated list of column names including risk score columns.
+    """
+    header.extend(iter(RISKCOLNAMES))
+    if alternative:
+        header.append("CLUSTER_ID")
+    return header
+
+
+def _compute_risk_score(line: str) -> Tuple[float, float]:
+    """Calculates the risk score difference and its absolute value from a line of
+    report data.
+
+    This function extracts two score values from the input line, computes their
+    difference, and returns both the difference and its absolute value.
+
+    Args:
+        line: A tab-separated string containing report data.
+
+    Returns:
+        Tuple[float, float]: The risk score difference and its absolute value.
+    """
+    fields = line.strip().split("\t")
+    score, score_alt = float(fields[20]), float(fields[21])
+    score_diff = score - score_alt
+    return score_diff, abs(score_diff)
+
+
+def compute_risk_score(report_fname: str, report_outfname: str, alternative: bool):
+    """Reads a report file, computes risk scores for each line, and writes the
+    results to a new file.
+
+    This function processes the input report, appends risk score columns, and
+    outputs the updated data. It handles both primary and alternative target
+    scenarios based on the provided flag.
+
+    Args:
+        report_fname: Path to the input report file.
+        report_outfname: Path to the output report file.
+        alternative: Whether to process as alternative targets.
+
+    Returns:
+        None
+
+    Raises:
+        OSError: If an error occurs while reading or writing files.
+    """
+    try:
+        with open(report_fname, mode="r") as fin, open(
+            report_outfname, mode="w"
+        ) as fout:
+            header = fin.readline().strip().split("\t")  # read file header
+            header = add_risk_score_columns(
+                header, alternative
+            )  # append risk score columns
+            fout.write("\t".join(header) + "\n")  # write header to out put report
+            for line in fin:
+                score_diff, score_diff_abs = _compute_risk_score(line)
+                fout.write(f"{line}\t{score_diff}\t{score_diff_abs}\n")
+    except (IOError, Exception) as e:
+        raise OSError(
+            f"An error occurred while computing risk scores for {report_fname}"
+        ) from e
+
+
+def risk_score() -> None:
+    """Parses command-line arguments and computes risk scores for a report file.
+
+    This function reads input and output file paths and a flag from the command
+    line, then processes the report to add risk score columns accordingly.
+
+    Returns:
+        None
+    """
+    report_fname, report_outfname, alternative = sys.argv[1:4]  # read input args
+    alternative = alternative == "True"  # are primary or alternative targets?
+    # compute risk score on CFD/CRISTA
+    compute_risk_score(report_fname, report_outfname, alternative)
+
 
-file_in = sys.argv[1]
-file_out = sys.argv[2]
-alt = sys.argv[3]
-alt = alt == "True"
-with open(file_in, "r") as fin:
-    with open(file_out, "w") as fout:
-        header = fin.readline().strip().split("\t")
-        # header.insert(22, 'Highest_CFD_Risk_Score')
-        # header.insert(23, 'Highest_CFD_Absolute_Risk_Score')
-        # header.append('MMBLG_CFD_Risk_Score')
-        # header.append('MMBLG_CFD_Absolute_Risk_Score')
-        header.append("Highest_CFD_Risk_Score")
-        header.append("Highest_CFD_Absolute_Risk_Score")
-        if alt:
-            header.append("CLUSTER_ID")
-        fout.write("\t".join(header) + "\n")
-        for line in fin:
-            splitted = line.strip().split("\t")
-            cfd_diff = float(splitted[20]) - float(splitted[21])
-            abs_diff = abs(cfd_diff)
-            # mmblg_cfd_diff = float(splitted[42]) - float(splitted[43])
-            # mmblg_abs_diff = abs(mmblg_cfd_diff)
-            fout.write(
-                "\t".join(splitted) + "\t" + str(cfd_diff) + "\t" + str(abs_diff) + "\n"
-            )
-            # if alt:
-            #     fout.write('\t'.join(splitted[:22])+'\t'+"{:.3f}".format(cfd_diff)+'\t'+"{:.3f}".format(abs_diff)+"\t"+"\t".join(
-            #         splitted[22:-1])+'\t'+"{:.3f}".format(mmblg_cfd_diff)+'\t'+"{:.3f}".format(mmblg_abs_diff)+"\t"+splitted[-1]+'\n')
-            # else:
-            #     fout.write('\t'.join(splitted[:22])+'\t'+"{:.3f}".format(cfd_diff)+'\t'+"{:.3f}".format(abs_diff)+"\t"+"\t".join(
-            #         splitted[22:])+'\t'+"{:.3f}".format(mmblg_cfd_diff)+'\t'+"{:.3f}".format(mmblg_abs_diff)+'\n')
+risk_score()

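The refactored script keeps the old command-line interface (input report, output report, and a True/False flag for alternative targets), with the per-line arithmetic now isolated in _compute_risk_score and the header handling in add_risk_score_columns. A minimal invocation sketch in Python, using hypothetical file names and assuming the script is executable and run from the PostProcess directory:

import subprocess

# Hypothetical report names; the third argument selects alternative-target mode
# ("True" also appends a CLUSTER_ID column to the header, "False" does not).
subprocess.run(
    ["./add_risk_score.py", "example.bestCFD.txt", "example.bestCFD.risk.txt", "False"],
    check=True,
)
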
PostProcess/annotation.py

Lines changed: 2 additions & 0 deletions
@@ -102,6 +102,8 @@ def annotate_target(chrom: str, start: int, stop: int, annotation: TabixFile) ->
     Returns:
         Comma-separated string of annotation features for the target.
     """
+    if chrom not in annotation.contigs:
+        return "n"  # contig not in input annotation file
     target_anns = {
         feature.split()[3]
         for feature in annotation.fetch(chrom, start, stop)

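The added guard returns the "n" placeholder (the same value remove_n_and_dots.py later rewrites to NA) whenever the target's contig is missing from the tabix-indexed annotation BED, instead of asking TabixFile.fetch for a region the index does not contain. A minimal sketch of the pattern with pysam, using a hypothetical bgzip-compressed and tabix-indexed annotations.bed.gz and hypothetical coordinates:

import pysam

# Skip contigs the tabix index does not contain, mirroring the guard added to
# annotate_target(); the annotation file and coordinates here are hypothetical.
annotation = pysam.TabixFile("annotations.bed.gz")  # expects annotations.bed.gz.tbi
chrom, start, stop = "chr1", 1_000_000, 1_000_100

if chrom not in annotation.contigs:
    target_anns = "n"  # contig not covered by the annotation file
else:
    # BED name field (4th column) of every feature overlapping the target
    target_anns = ",".join(
        {feature.split()[3] for feature in annotation.fetch(chrom, start, stop)}
    )
print(target_anns)
annotation.close()
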
PostProcess/post_analisi_indel.sh

Lines changed: 0 additions & 35 deletions
@@ -34,38 +34,3 @@ sed -i 1i"$header" "$output_folder/crispritz_targets/indels_${ref_name}+${vcf_na
 ./analisi_indels_NNN.sh "$output_folder/crispritz_targets/${ref_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr" "$output_folder/crispritz_targets/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr" "$output_folder/${fake_chr}_${pam_name}_${guide_name}_${annotation_name}_${mm}_${bDNA}_${bRNA}" "$annotation_file" "$dict_folder/log_indels_$vcf_name" "$ref_folder/$true_chr.fa" $mm $bDNA $bRNA "$guide_file" "$pam_file" "$output_folder"
 rm "$output_folder/crispritz_targets/${ref_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr"
 rm "$output_folder/crispritz_targets/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr"
-
-# vcf_name=$(basename $3)
-# guide_file=$4
-# guide_name=$(basename $4)
-# mm=$5
-# bDNA=$6
-# bRNA=$7
-# annotation_file=$8
-# annotation_name=$(basename $8)
-# pam_file=$9
-# pam_name=$(basename $9)
-# # sampleID=${10}
-# dict_folder=${10}
-
-# final_res=${11}
-# final_res_alt=${12}
-
-# key=${13}
-
-# echo "Processing INDELs results for $key, starting post-analysis"
-# true_chr=$key
-# fake_chr="fake$true_chr"
-
-# # create file to prevent void grep failing
-# touch "$output_folder/crispritz_targets/${ref_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr"
-# # create file to prevent void grep failing
-# touch "$output_folder/crispritz_targets/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr"
-# awk -v fake_chr="$fake_chr" '$0 ~ fake_chr {print}' "$output_folder/crispritz_targets/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt" >"$output_folder/crispritz_targets/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr"
-# header=$(head -1 $output_folder/crispritz_targets/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt)
-# sed -i 1i"$header" "$output_folder/crispritz_targets/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr"
-
-# ./analisi_indels_NNN.sh "$output_folder/crispritz_targets/${ref_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr" "$output_folder/crispritz_targets/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr" "$output_folder/${fake_chr}_${pam_name}_${guide_name}_${annotation_name}_${mm}_${bDNA}_${bRNA}" "$annotation_file" "$dict_folder/log_indels_$vcf_name" "$ref_folder/$true_chr.fa" $mm $bDNA $bRNA "$guide_file" "$pam_file" "$output_folder" || {
-#     echo "CRISPRme ERROR: indels analysis failed (script: ${0} line $((LINENO - 1)))" >&2
-#     exit 1
-# }

PostProcess/remove_n_and_dots.py

Lines changed: 71 additions & 20 deletions
@@ -1,36 +1,87 @@
 #!/usr/bin/env python
-"""
-Created on Sun May 30 15:49:39 2021
+"""Processes a report file to replace 'n' and '.' values with 'NA'.
 
-@author: franc
+This module reads a tab-separated report file in chunks, cleans specific values,
+and overwrites the original file with the cleaned data. It is intended for use
+in post-processing steps where standardized missing value representation is required.
 """
 
-"""
-Script used to replace n and . with NA in bestMerge
-"""
-
-import warnings
+from pandas.io.parsers import TextFileReader
 
-warnings.simplefilter(action="ignore", category=FutureWarning)
 import pandas as pd
+
+import subprocess
+import warnings
 import sys
 import os
 
-in_path = sys.argv[1]
+warnings.simplefilter(action="ignore", category=FutureWarning)
+
+# dataframe read chunksize
+CHUNKSIZE = 50000
+
+
+def read_report_chunks(report_fname: str) -> TextFileReader:
+    """Reads a tab-separated report file in chunks for efficient processing.
+
+    This function returns an iterator that yields DataFrame chunks from the
+    specified file.
+
+    Args:
+        report_fname: Path to the tab-separated report file.
+
+    Returns:
+        TextFileReader: An iterator over DataFrame chunks.
+    """
+    return pd.read_csv(report_fname, sep="\t", chunksize=CHUNKSIZE)
+
+
+def polish_report(report_chunks: TextFileReader, report_fname: str) -> None:
+    """Cleans and writes report data by replacing specific values with 'NA'.
+
+    This function processes each chunk of the report, replacing 'n' and '.' values
+    with 'NA', and writes the cleaned data to a temporary file.
+
+    Args:
+        report_chunks: An iterator over DataFrame chunks from the report.
+        report_fname: Path to the original report file.
 
-chunksize_ = 50000
-chunks = pd.read_csv(in_path, sep="\t", chunksize=chunksize_)
+    Returns:
+        None
+    """
+    header = True  # only for first iteration
+    for chunk in report_chunks:
+        assert "rsID" in chunk.columns.tolist()
+        chunk: pd.DataFrame = chunk.replace("n", "NA")  # replace ns with NAs
+        chunk["rsID"] = chunk["rsID"].str.replace(
+            ".", "NA"
+        )  # replace . in rsids with NAs
+        chunk.to_csv(
+            f"{report_fname}.tmp", header=header, mode="a", sep="\t", index=False
+        )
+        header = False  # not required anymore
 
 
-header = True
-for chunk in chunks:
+def remove_n_dots() -> None:
+    """Replaces 'n' and '.' values in a report file with 'NA' and overwrites the
+    original file.
 
-    chunk = chunk.replace("n", "NA")
-    # chunk = chunk.replace(regex=['\*.,\*', '\*,.\*'], value='NA')
-    chunk["rsID"] = chunk["rsID"].str.replace(".", "NA")
+    This function reads a report file, cleans specific values, and saves the cleaned
+    data back to the original file.
 
-    chunk.to_csv(in_path + ".tmp", header=header, mode="a", sep="\t", index=False)
+    Returns:
+        None
+    """
+    report_fname = sys.argv[1]  # read input report filename
+    if not os.path.isfile(report_fname) or os.stat(report_fname).st_size <= 0:
+        raise FileNotFoundError(f"Report {report_fname} not found or empty")
+    # remove ns and dots and replace them with NAs
+    polish_report(read_report_chunks(report_fname), report_fname)
+    if not os.path.isfile(f"{report_fname}.tmp"):
+        raise FileNotFoundError(f"Polished report {report_fname}.tmp not created?")
+    code = subprocess.call(f"mv {report_fname}.tmp {report_fname}")
+    if code != 0:
+        raise subprocess.SubprocessError(f"Failed renaming {report_fname}.tmp")
 
-    header = False
 
-os.system(f"mv {in_path}.tmp {in_path}")
+remove_n_dots()

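The cleaning logic is unchanged in substance: the report is still streamed through pandas in 50,000-row chunks, 'n' cells and '.' rsIDs are rewritten to NA, and the cleaned chunks are appended to a temporary file that then replaces the original. A minimal sketch of that chunked pattern, assuming a hypothetical tab-separated example_report.tsv with an rsID column:

import pandas as pd

# Hypothetical input file; mirrors the chunked cleaning done by polish_report().
report = "example_report.tsv"
header = True
for chunk in pd.read_csv(report, sep="\t", chunksize=50000):
    chunk = chunk.replace("n", "NA")  # 'n' cells -> NA
    chunk["rsID"] = chunk["rsID"].str.replace(".", "NA")  # '.' rsIDs -> NA
    chunk.to_csv(f"{report}.tmp", header=header, mode="a", sep="\t", index=False)
    header = False  # write the header only once
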
PostProcess/submit_job_automated_new_multiple_vcfs.sh

Lines changed: 2 additions & 2 deletions
@@ -50,7 +50,7 @@ email=${17} # email address (website only)
 echo -e "MAIL: $email"
 echo -e "CPU used: $ncpus"
 
-#used to solve base editor check in resultintegration phase
+# used to solve base editor check in result-integration phase
 base_check_start=${18}
 base_check_end=${19}
 base_check_set=${20}
@@ -717,7 +717,7 @@ if [ -s $logerror ]; then
     rm -f $output_folder/*.bestCFD.txt $output_folder/*.bestmmblg.txt $output_folder/*.bestCRISTA.txt $output_folder/*.bestMerge.txt $output_folder/*.altMerge.txt
     exit 1
 fi
-# remove Ns and dots from rsID from primary targets files
+# remove Ns and dots from rsID from alternative targets files
 ./remove_n_and_dots.py $final_res_alt.bestCFD.txt &
 ./remove_n_and_dots.py $final_res_alt.bestmmblg.txt &
 ./remove_n_and_dots.py $final_res_alt.bestCRISTA.txt &
