-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathwes_fastq_to_gvcf.sh
More file actions
149 lines (122 loc) · 9.57 KB
/
wes_fastq_to_gvcf.sh
File metadata and controls
149 lines (122 loc) · 9.57 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
#!/bin/bash
#nohup sh /data/home/test2/LiDingYang/WES_pedigree/wes_fastq_to_gvcf.sh PID21-056A_1.fastq.gz PID21-056A_2.fastq.gz HYMKYDSX2 Test_lib PID21-056A /data/home/test2/LiDingYang/WES_pedigree &
#nohup sh /data/home/test2/LiDingYang/WES_pedigree/wes_fastq_to_gvcf.sh PID21-057_1.fastq.gz PID21-057_2.fastq.gz HYMKYDSX2 lib PID21-057 /data/home/test2/LiDingYang/WES_pedigree &
#nohup sh /data/home/test2/LiDingYang/WES_pedigree/wes_fastq_to_gvcf.sh PID21-058_1.fastq.gz PID21-058_2.fastq.gz HYMKYDSX2 lib PID21-058 /data/home/test2/LiDingYang/WES_pedigree &
#nohup sh /data/home/test2/LiDingYang/WES_pedigree/wes_fastq_to_gvcf.sh PID21-232_1.fastq.gz PID21-232_2.fastq.gz HYMKYDSX2 lib PID21-232 /data/home/test2/LiDingYang/WES_pedigree &
#nohup sh /data/home/test2/LiDingYang/WES_pedigree/wes_fastq_to_gvcf.sh PID21-233_1.fastq.gz PID21-233_2.fastq.gz HYMKYDSX2 lib PID21-233 /data/home/test2/LiDingYang/WES_pedigree &
#nohup sh /data/home/test2/LiDingYang/WES_pedigree/wes_fastq_to_gvcf.sh PID21-234_1.fastq.gz PID21-234_2.fastq.gz HYMKYDSX2 lib PID21-234 /data/home/test2/LiDingYang/WES_pedigree &
#nohup sh /data/home/test2/LiDingYang/WES_pedigree/wes_fastq_to_gvcf.sh A140135A_1.fastq.gz A140135A_2.fastq.gz H9EF6ADXX lib A140135A /data/home/test2/LiDingYang/WES_pedigree &
#nohup sh /data/home/test2/LiDingYang/WES_pedigree/wes_fastq_to_gvcf.sh A140136A_1.fastq.gz A140136A_2.fastq.gz H9EF6ADXX lib A140136A /data/home/test2/LiDingYang/WES_pedigree &
#nohup sh /data/home/test2/LiDingYang/WES_pedigree/wes_fastq_to_gvcf.sh A140417A_1.fastq.gz A140417A_2.fastq.gz H9EF6ADXX lib A140417A /data/home/test2/LiDingYang/WES_pedigree &
#nohup sh /data/home/test2/LiDingYang/WES_pedigree/wes_fastq_to_gvcf.sh A140418A_1.fastq.gz A140418A_2.fastq.gz H9EF6ADXX lib A140418A /data/home/test2/LiDingYang/WES_pedigree &
#nohup sh /share/home/grp-wangyf/yf_intern/ldy/WGS/wes_fastq_to_gvcf.sh ./A140019A/A140019A_1.fq.gz ./A140019A/A140019A_2.fq.gz HYMKYDSX2 Test_lib A140019A /share/home/grp-wangyf/yf_intern/ldy/WGS
#nohup sh /share/home/grp-wangyf/yf_intern/ldy/WGS/wes_fastq_to_gvcf.sh ./A140227A/A140227A_1.fq.gz ./A140227A/A140227A_2.fq.gz HYMKYDSX2 Test_lib A140227A /share/home/grp-wangyf/yf_intern/ldy/WGS
#nohup sh /share/home/grp-wangyf/yf_intern/ldy/WGS/wes_fastq_to_gvcf.sh ./A140228A/A140228A_1.fq.gz ./A140228A/A140228A_2.fq.gz HYMKYDSX2 Test_lib A140228A /share/home/grp-wangyf/yf_intern/ldy/WGS
#nohup sh /share/home/grp-wangyf/yf_intern/ldy/WGS/wes_fastq_to_gvcf.sh ./A150001A/A150001A_1.fq.gz ./A150001A/A150001A_2.fq.gz HYMKYDSX2 Test_lib A150001A /share/home/grp-wangyf/yf_intern/ldy/WGS
#nohup sh /share/home/grp-wangyf/yf_intern/ldy/WGS/wes_fastq_to_gvcf.sh ./A160688B/A160688B_1.fq.gz ./A160688B/A160688B_2.fq.gz HYMKYDSX2 Test_lib A160688B /share/home/grp-wangyf/yf_intern/ldy/WGS
#nohup sh /share/home/grp-wangyf/yf_intern/ldy/WGS/wes_fastq_to_gvcf.sh ./A190118A/A190118A_1.fq.gz ./A190118A/A190118A_2.fq.gz HYMKYDSX2 Test_lib A190118A /share/home/grp-wangyf/yf_intern/ldy/WGS
#nohup sh /share/home/grp-wangyf/yf_intern/ldy/WGS/wes_fastq_to_gvcf.sh ./A190117A/A190117A_1.fq.gz ./A190117A/A190117A_2.fq.gz HYMKYDSX2 Test_lib A190117A /share/home/grp-wangyf/yf_intern/ldy/WGS
#nohup sh /share/home/grp-wangyf/yf_intern/ldy/WGS/wes_fastq_to_gvcf.sh ./A190119A/A190119A_1.fq.gz ./A190119A/A190119A_2.fq.gz HYMKYDSX2 Test_lib A190119A /share/home/grp-wangyf/yf_intern/ldy/WGS
#nohup sh /share/home/grp-wangyf/yf_intern/ldy/WGS/wes_fastq_to_gvcf.sh ./NM123/NM123_1.fq.gz ./NM123/NM123_2.fq.gz HYMKYDSX2 Test_lib NM123 /share/home/grp-wangyf/yf_intern/ldy/WGS
#nohup sh /share/home/grp-wangyf/yf_intern/ldy/WGS/wes_fastq_to_gvcf.sh ./NM133/NM133_1.fq.gz ./NM133/NM133_2.fq.gz HYMKYDSX2 Test_lib NM133 /share/home/grp-wangyf/yf_intern/ldy/WGS
## 这是多样本变异检测流程的前半部分,这个部分假设你的每个样本只有一对用Illumina测序仪测序的PE fastq
# 一些软件和工具的路径, 根据实际
trimmomatic=/share/home/grp-wangyf/yf_intern/ldy/Trimmomatic-0.39/trimmomatic-0.39.jar
bwa=bwa
samtools=samtools
gatk=gatk
java=/usr/lib/jvm/java-1.8.0-openjdk-1.8.0.181-7.b13.el7.x86_64/jre/bin/java
#readlink -f $(which java)
#reference
reference=/share/home/grp-wangyf/yf_intern/ldy/WGS/resource
GATK_bundle=/share/home/grp-wangyf/yf_intern/ldy/WGS/resource
## 这一步不是必须的,取决于GATK_bundle中的这4份文件是否已经有建索引,如没有再执行
#$gatk IndexFeatureFile --feature-file $GATK_bundle/hapmap_3.3.hg38.vcf
#$gatk IndexFeatureFile --feature-file $GATK_bundle/1000G_omni2.5.hg38.vcf
#$gatk IndexFeatureFile --feature-file $GATK_bundle/1000G_phase1.snps.high_confidence.hg38.vcf
#$gatk IndexFeatureFile --feature-file $GATK_bundle/Mills_and_1000G_gold_standard.indels.hg38.vcf
#$gatk IndexFeatureFile --feature-file $GATK_bundle/dbsnp_146.hg38.vcf
## shell执行参数
fq1=$1
fq2=$2
RGID=$3 ## Read Group,一般用Lane ID代替
library=$4 ## 测序文库编号
sample=$5 ## 样本ID
outdir=$6 ## 输出目录的路径
## 按样本设置目录
outdir=${outdir}/${sample}
## 通过fastq1获得fastq的前缀名字,这里假设了原始的fastq1和fastq2有相同的前缀名字
## 并且假设fastq1的文件名格式为*.1.fq.gz;
fq_file_name=`basename $fq1`
fq_file_name=${fq_file_name%%_1.fastq.gz}
# output diretory
if [ ! -d $outdir/cleanfq ]
then mkdir -p $outdir/cleanfq
fi
if [ ! -d $outdir/bwa ]
then mkdir -p $outdir/bwa
fi
if [ ! -d $outdir/gatk ]
then mkdir -p $outdir/gatk
fi
## 使用Trimmomatic对原始数据进行质控,ILLUMINACLIP中的一个关键参数 keepBothReads设为True。
time java -jar $trimmomatic PE \
-threads 18 \
$fq1 $fq2 \
$outdir/cleanfq/${fq_file_name}.paired.1.fq.gz $outdir/${fq_file_name}.unpaired.1.fq.gz \
$outdir/cleanfq/${fq_file_name}.paired.2.fq.gz $outdir/${fq_file_name}.unpaired.2.fq.gz \
ILLUMINACLIP:/share/home/grp-wangyf/yf_intern/ldy/Trimmomatic-0.39/adapters/TruSeq3-PE-2.fa:2:30:10:8:True \
SLIDINGWINDOW:5:15 LEADING:5 TRAILING:5 MINLEN:50 && echo "** fq QC done **"
## 使用bwa mem完成数据比对,bwa mem对任何长度大于40bp小于2000bp的read都是非常有效的; PL:ILLUMINA是我默认的
time $bwa mem -t 18 -M -Y -R "@RG\tID:$RGID\tPL:ILLUMINA\tLB:$library\tSM:$sample" $reference/Homo_sapiens_assembly38.fasta \
$outdir/cleanfq/${fq_file_name}.paired.1.fq.gz $outdir/cleanfq/${fq_file_name}.paired.2.fq.gz | $samtools view -Sb - > $outdir/bwa/${sample}.bam && \
echo "** BWA MEM done **" && \
time $samtools sort -@ 18 -m 30G -O bam -o $outdir/bwa/${sample}.sorted.bam $outdir/bwa/${sample}.bam && echo "** sorted raw bamfile done "
## 这一步不是必须的
# time $samtools index $outdir/bwa/${sample}.sorted.bam && echo "** ${sample}.sorted.bam index done **"
## 标记重复序列
time $gatk MarkDuplicates \
-I $outdir/bwa/${sample}.sorted.bam \
-M $outdir/bwa/${sample}.markdup_metrics.txt \
-O $outdir/bwa/${sample}.sorted.markdup.bam && echo "** ${sample}.sorted.bam MarkDuplicates done **"
## 为${sample}.sorted.markdup.bam构建Index,这是继续后续步骤所必须的
time $samtools index $outdir/bwa/${sample}.sorted.markdup.bam && echo "** ${sample}.sorted.markdup.bam index done **"
## 执行BQSR
## [注]Does your vcf file have an index? GATK4 does not support on the fly indexing of VCFs anymore.
time $gatk BaseRecalibrator \
-R $reference/Homo_sapiens_assembly38.fasta \
-I $outdir/bwa/${sample}.sorted.markdup.bam \
--known-sites $GATK_bundle/Homo_sapiens_assembly38.known_indels.vcf.gz \
--known-sites $GATK_bundle/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz \
--known-sites $GATK_bundle/Homo_sapiens_assembly38.dbsnp138.vcf \
-O $outdir/bwa/${sample}.sorted.markdup.recal_data.table \
&& echo "** ${sample}.sorted.markdup.recal_data.table done **"
time $gatk ApplyBQSR \
--bqsr-recal-file $outdir/bwa/${sample}.sorted.markdup.recal_data.table \
-R $reference/Homo_sapiens_assembly38.fasta \
-I $outdir/bwa/${sample}.sorted.markdup.bam \
-O $outdir/bwa/${sample}.sorted.markdup.BQSR.bam && echo "** ApplyBQSR done **"
## 为${sample}.sorted.markdup.BQSR.bam构建Index,这是继续后续步骤所必须的
time $samtools index $outdir/bwa/${sample}.sorted.markdup.BQSR.bam && echo "** ${sample}.sorted.markdup.BQSR.bam index done **"
## 输出样本的gVCF,有两个方式来完成,结果一样,速度不同。
## 输出样本的全gVCF,面对较大的输入文件时,速度较慢
time $gatk HaplotypeCaller \
--emit-ref-confidence GVCF \
-R $reference/Homo_sapiens_assembly38.fasta \
-I $outdir/bwa/${sample}.sorted.markdup.BQSR.bam \
-O $outdir/gatk/${sample}.HC.g.vcf.gz && echo "** GVCF ${sample}.HC.g.vcf.gz done **"
# ## 这是第二种为单个样本生成gvcf的方式,目的是通过分染色体获得速度的提升
# chrom=( chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 chr22 chrX chrY chrM )
# for i in ${chrom[@]}; do
# time $gatk HaplotypeCaller \
# --emit-ref-confidence GVCF \
# -R $reference/Homo_sapiens_assembly38.fasta \
# -I $outdir/bwa/${sample}.sorted.markdup.BQSR.bam \
# -L $i \
# -O $outdir/gatk/${sample}.HC.${i}.g.vcf.gz && echo "** ${sample}.HC.${i}.g.vcf.gz done **" &
# done
# 可以被删除清理的文件,这不是必须执行的
# 对于比对文件只有最终的${sample}.sorted.markdup.BQSR.bam值得留下来
rm -f $outdir/bwa/${sample}.bam $outdir/bwa/${sample}.sorted.bam $outdir/bwa/${sample}.sorted.markdup.bam*
rm -f $outdir/cleanfq/${fq_file_name}.paired.1.fq.gz $outdir/cleanfq/${fq_file_name}.paired.2.fq.gz
rm -f $outdir/cleanfq/${fq_file_name}.unpaired.1.fq.gz $outdir/cleanfq/${fq_file_name}.unpaired.2.fq.gz