-
Notifications
You must be signed in to change notification settings - Fork 10
Expand file tree
/
Copy pathrunSR.sh
More file actions
182 lines (147 loc) · 7.89 KB
/
runSR.sh
File metadata and controls
182 lines (147 loc) · 7.89 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
src=`cd $(dirname $0); pwd -P`
SRname="SR"
she_cutoff_left=0.25
she_cutoff_right=0.6
srbk_cutoff=0.75
SRavg_depth=100
mapquality=20
t=5
minclip_num=2
pipline=$(basename $0)
for com in perl minimap2 samtools
do
mg=$(command -v $com)
if [ "$mg" == "" ]
then
echo -e "\n\tError: Command $com is NOT in you PATH. Please check.\n"
exit 1
fi
done
Usage="\nUsage:\n\t$pipline -g Genome.fa -z Genome.fa.size -1 NGS_sort.bam \n or \t$pipline -g Genome.fa -z Genome.fa.size -1 fq1.gz -2 fq2.gz -f she_cutoff_left -h she_cutoff_right -r srbk_cutoff \n\t[default: -q 20 -f 0.4 -h 0.6 -r 0.75 -m 2 -a 100 -t 5]"
while getopts "g:z:1:2:q:f:h:r:m:a:t:" opt
do
case $opt in
g) ref_fa=$OPTARG ;;
z) ref_fa_size=$OPTARG ;;
1) query_1=$OPTARG ;;
2) query_2=$OPTARG ;;
q) mapquality=$OPTARG ;;
m) minclip_num=$OPTARG ;;
f) she_cutoff_left=$OPTARG ;;
h) she_cutoff_right=$OPTARG ;;
r) srbk_cutoff=$OPTARG ;;
a) SRavg_depth=$OPTARG ;;
t) t=$OPTARG ;;
?)
echo ":| WARNING: Unknown option. Ignoring: Exiting!"
exit 1;;
esac
done
if [ ! -e "$ref_fa" ];then
echo -e "\n\tgenome.fa is not found, please check the README.md for the requirements of input files!\n\t$Usage \n"
exit 1
fi
if [ ! -e "$ref_fa_size" ]
then
echo -e "\n\tGenome.fasta.size is not found, please check the README.md for the requirements of input files!\n\t$Usage \n"
exit 1
fi
if [ `echo "$minclip_num < 0"|bc` -eq 1 ] ; then
echo -e "\n\tminclip_num ERROR: $minclip_num Exit !"
exit 1
fi
if [ `echo "$t <= 0"|bc` -eq 1 ] ; then
echo -e "\n\tthread ERROR: $t Exit !"
exit 1
fi
if [ `echo "$she_cutoff_left < 0"|bc` -eq 1 ] ; then
echo -e "\n\tshe_cutoff_left ERROR: $she_cutoff_left Exit !"
exit 1
fi
query_1_tmp=$(echo $query_1 | tr [A-Z] [a-z])
query_2_tmp=$(echo $query_2 | tr [A-Z] [a-z])
if [[ "$query_1_tmp" =~ (fa$)|(fq$)|(fasta$)|(fastq$)|(fa.gz$)|(fq.gz$)|(fasta.gz$)|(fastq.gz$)|(bam$) ]]; then
if [ -d "SRout" ];then
echo -e "Error:: SRout already exists, Exit !"
exit 1
fi
mkdir SRout
if [[ "$query_1_tmp" =~ (bam$) ]];then
if [ ! -e "$query_1" ];then
echo -e "\n\t $query_1 is not found, please check the README.md for the requirements of input files!\n\t$Usage \n"
exit 1
fi
#if [ ! -e $query_1".bai" ];then
#echo -e "\n\t $query_1".bai" is not found, cannot read index for $query_1 \n\t$Usage \n"
#exit 1
#fi
input_bam=$query_1
echo -e "Skipping alignment::\n[M::worker_pipeline:: Filtering bamfiles]"
samtools view -h -F 1796 $input_bam -t $t | perl $src/srsam_cigar_filter.pl $mapquality - | samtools view -h -S -b -@ $t - -o SRout/$SRname"_sort.bam"
samtools index SRout/$SRname"_sort.bam"
fi
if [[ "$query_1_tmp" =~ (fa$)|(fq$)|(fasta$)|(fastq$)|(fa.gz$)|(fq.gz$)|(fasta.gz$)|(fastq.gz$) ]]; then
if [ ! -e "$query_1" ];then
echo -e "\n\t $query_1 is not found, please check the README.md for the requirements of input files!\n\t$Usage \n"
exit 1
fi
if [ ! -e "$query_2" ];then
echo -e "worker_pipeline::\nWARNING: Short pair_end read pair_2 is not found, only $query_1 used"
fi
if [[ -e "$query_2" ]];then
if [[ "$query_2_tmp" =~ (fa$)|(fq$)|(fasta$)|(fastq$)|(fa.gz$)|(fq.gz$)|(fasta.gz$)|(fastq.gz$) ]]; then
echo -e "worker_pipeline::"
else
echo -e "\n\t $query_2 should be fastq/fa suffix, please check the README.md for the requirements of input files!\n\t$Usage \n"
exit 1
fi
fi
echo -e "worker_pipeline:: NGS reads aligning and filtering"
minimap2 -ax sr -R "@RG\tID:foo\tSM:bar1\tLB:lib" $ref_fa $query_1 $query_2 -t $t | perl $src/srsam_cigar_filter.pl $mapquality - | samtools view -h -F 1796 -S -b -@ $t - -o SRout/$SRname"_unsort.bam"
echo -e "[M::worker_pipeline:: Sort bamfiles]"
samtools sort SRout/$SRname"_unsort.bam" -@ $t -o SRout/$SRname"_sort.bam"
rm SRout/$SRname"_unsort.bam" && samtools index SRout/$SRname"_sort.bam"
fi
else
echo -e "Error, query should be fasta, fq or a sorted.bam \n!"
exit 1
fi
#echo -e "\n##########################################################################################\n"
echo -e "[M::worker_pipeline:: Compute effective NGS coverage]"
#-------------------------------------------------------------------------------------------------------------
#samtools depth -a SRout/$SRname"_sort.bam" > SRout/$SRname"_sort.depth"
samtools view -h -@ $t -q $mapquality SRout/$SRname"_sort.bam" |samtools depth -@ $t -a - | perl $src/avgdep.pl SRout seq.size - >SRout/$SRname"_sort.depth"
SRavg_depth=$(cat SRout/Avgcov)
echo -e "[M::NGS mapping coverage: $SRavg_depth]"
rm SRout/Avgcov
perl $src/SReffect_size.pl SRout/$SRname"_sort.depth" >SRout/$SRname"_eff.size"
#echo -e "\n##########################################################################################\n"
#echo -e "[M::worker_pipeline:: Get clipping cover rate]"
#-------------------------------------------------------------------------------------------------------------
if (($minclip_num >= 0)); then
echo -e "[M::worker_pipeline:: Collect potential CRE|RH]"
#######start extract clipped reads
samtools view -@ $t -q $mapquality SRout/$SRname"_sort.bam" | perl $src/caculate_breakpoint_depth.pl - > SRout/$SRname"_clipped.cov"
perl -alne 'print if($F[3]>='$minclip_num')' SRout/$SRname"_clipped.cov" >SRout/$SRname"_clipped.cov.tmp"
perl $src/synthesize_SRbkdep_and_alldep.pl SRout/$SRname"_clipped.cov.tmp" SRout/$SRname"_sort.depth" >SRout/$SRname"_clip.coverRate"
perl -alne 'print if($F[3]>='$minclip_num' && $F[4]>=1 && $F[4] < 3*'$SRavg_depth' && $F[3]/$F[4] >'$she_cutoff_left' )' SRout/$SRname"_clip.coverRate" >SRout/$SRname"_putative.RE.bk.tmp"
#######start extract nonmapping locus
perl $src/search_dep0.pl SRout/$SRname"_sort.depth" >SRout/$SRname"_putative.RE.0.tmp"
perl $src/srbk_merge_srnoncov.pl SRout/$SRname"_putative.RE.bk.tmp" SRout/$SRname"_putative.RE.0.tmp" |sort -k 1,1 -k 2,2n >SRout/$SRname"_putative.RE.RH"
perl -alne 'print if( $F[3]/$F[4] <= '$she_cutoff_right' )' SRout/$SRname"_putative.RE.RH" >SRout/$SRname"_putative.RH"
perl -alne 'print if( $F[3]/$F[4] >= '$srbk_cutoff' )' SRout/$SRname"_putative.RE.RH" >SRout/$SRname"_putative.RE"
rm SRout/$SRname"_putative.RE.0.tmp" SRout/$SRname"_putative.RE.bk.tmp"
fi
#if (($minclip_num >=1)); then
# echo -e "[M::worker_pipeline:: Collect potential CRE|H]"
# samtools view -@ $t -q $mapquality SRout/$SRname"_sort.bam" | perl $src/caculate_breakpoint_depth.pl - > SRout/$SRname"_clipped.cov"
# perl -alne 'print if($F[3]>='$minclip_num')' SRout/$SRname"_clipped.cov" >SRout/$SRname"_clipped.cov.tmp"
# perl $src/synthesize_SRbkdep_and_alldep.pl SRout/$SRname"_clipped.cov.tmp" SRout/$SRname"_sort.depth" >SRout/$SRname"_clip.coverRate"
# perl -alne 'print if($F[3]>='$minclip_num' && $F[4]>=1 && $F[4] < 3*'$SRavg_depth' && $F[3]/$F[4] >'$she_cutoff_left')' SRout/$SRname"_clip.coverRate" > SRout/$SRname"_clip.coverRate.tmp"
# perl -alne 'print if( $F[3]/$F[4] <= '$she_cutoff_right' )' SRout/$SRname"_clip.coverRate.tmp" >SRout/$SRname"_putative.RH"
# perl -alne 'print if( $F[3]/$F[4] >= '$srbk_cutoff' )' SRout/$SRname"_clip.coverRate.tmp" >SRout/$SRname"_putative.RE"
# cat SRout/$SRname"_putative.RH" SRout/$SRname"_putative.RE" >SRout/$SRname"_putative.RE.RH"
# rm SRout/$SRname"_clipped.cov.tmp" SRout/$SRname"_clip.coverRate.tmp"
#fi
#echo -e "\n##########################################################################################\n"
echo -e "NGS data analysis completed. Check current directory SRout for final results!"