-
Notifications
You must be signed in to change notification settings - Fork 9
Expand file tree
/
Copy pathJAAWS.sh
More file actions
executable file
·123 lines (103 loc) · 4.46 KB
/
JAAWS.sh
File metadata and controls
executable file
·123 lines (103 loc) · 4.46 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
#!/bin/bash
#Jon's Awesome Annotation Wrapper Script (JAAWS)
#point here is to run trimmomatic to trim PE Illumina reads, then Spades, followed by some automated cleanup steps
DIR=$1
R1=$2
R2=$3
QR1=qtrimR1pe.fq.gz
QR2=qtrimR2pe.fq.gz
QCR1=clean.1.fq.gz
QCR2=clean.2.fq.gz
SG1=spades.scaffolds.fasta
CPU=$4
PHYLUM=Ascomycota
OUTPUT=$5
#move into directory, make sure reads are passed as full paths....
mkdir -p $1
cd $1
if [ ! -f clean.1.fq.gz ]; then
#run trimmomatic, trim a few bp off 5' end and remove adatpers, don't quality trim
trimmomatic PE -threads $CPU $R1 $R2 $QR1 qtrimR1se.fq.gz $QR2 qtrimR2se.fq.gz ILLUMINACLIP:/usr/local/opt/trimmomatic/share/trimmomatic/adapters/TruSeq3-PE.fa:2:30:10 HEADCROP:8 MINLEN:36
#map to phix genome, remove any paired reads that mapß
wget -c --tries=0 --read-timeout=20 ftp://ftp.ncbi.nlm.nih.gov/genomes/Viruses/Enterobacteria_phage_phiX174_sensu_lato_uid14015/NC_001422.fna
bowtie2-build NC_001422.fna phiX
#map to phiX and recover unmapped paired reads
bowtie2 -x phiX -p $CPU -1 $QR1 -2 $QR2 --un-conc clean.fq > /dev/null 2>&1
#re-compress output PE files
bgzip -@ $CPU clean.1.fq
bgzip -@ $CPU clean.2.fq
fi
if [ ! -f spades.scaffolds.clean2.fasta ]; then
#check for spades folder, run on continue if found
if [ ! -d spades ]; then
#Now run spades
spades.py -k 21,33,55,77,99,127 -t $CPU -m 64 --careful -o spades --pe1-1 $QCR1 --pe1-2 $QCR2
else
spades.py -k 21,33,55,77,99,127 --continue -t $CPU -m 64 --careful -o spades --pe1-1 $QCR1 --pe1-2 $QCR2
fi
#grab scaffolds from output folder
cp spades/scaffolds.fasta $SG1
#mapped reads to assembly using BWA
bwa index $SG1
bwa mem -t $CPU $SG1 $QCR1 $QCR2 | samtools view -@ $CPU -bS - > spades.bam
#resulting assembly was blasted against NCBI nt database
blastn -task megablast -query $SG1 -db $BLASTDB/nt -outfmt '6 qseqid staxids bitscore std sscinames sskingdoms stitle' -culling_limit 5 -num_threads $CPU -evalue 1e-25 -out spades.nt.megablast.out
#run Blobtools to look for contamination
blobtools create -i $SG1 -y spades -t spades.nt.megablast.out -b spades.bam
blobtools view -i blobDB.json
grep -v '^#' blobDB.table.txt > blob.summary.txt
blobtools blobplot -i blobDB.json
#calculate average coverage of largest 50 scaffolds
AVGCOV=$(egrep "$PHYLUM|no-hit" blob.summary.txt | head -n 50 | gawk '{ sum += $7; n++ } END { if (n > 0) print sum / n; }')
MINCOV=$(bc <<< $AVGCOV*0.05)
MINCOV=${MINCOV%.*}
grep -v '^#' blob.summary.txt | gawk -v cov=$MINCOV '{ if($7 < cov) print $1}' > blob.low-coverage.txt
#parse the blob summary output and filter for taxonomy and coverage, get rid of low coverage scaffolds, coverage less than 5% of average
grep -v '^#' blob.summary.txt | egrep -v "$PHYLUM|no-hit" | gawk '{print$1;}' > blob.contamination.txt
#now run the NCBI contamination screen
contamination_screen.sh $SG1 ncbi_contamination.txt
grep -v '^#' ncbi_contamination.txt | cut -f1 > ncbi_remove-list.txt
#combine all lits of scaffolds to remove
cat blob.contamination.txt blob.low-coverage.txt ncbi_remove-list.txt | sort | uniq > all.remove.txt
#remove contaminant scaffolds
fasta_remove.py $SG1 all.remove.txt > spades.scafffolds.clean1.fasta
#now run funannotate clean, dropping scaffolds less than 1 kb
funannotate clean -i spades.scafffolds.clean1.fasta -o spades.scaffolds.clean2.fasta -m 1000
fi
if [ ! -f $OUTPUT.changes.txt ]; then
#Clean up assembly using Pilon, 5 iterations should be sufficient
mkdir -p pilon
cd pilon
cp ../spades.scaffolds.clean2.fasta .
#number of loops is hardcoded to 5 runs of pilon
for i in {1..5}; do
if [[ $i == '1' ]]; then
GENOME=spades.scaffolds.clean2.fasta
else
x=$(($i - 1))
GENOME=pilon$x.fasta
fi
if [[ ! -f pilon$i.fasta ]]; then
echo "Working on $GENOME"
bwa index $GENOME
bwa mem -t $CPU $GENOME ../$QCR1 ../$QCR2 | samtools view -@ $CPU -bS - | samtools sort -@ $CPU -o pilon$i.bam -
samtools index pilon$i.bam
#now run pilon
pilon --genome $GENOME --frags pilon$i.bam --output pilon$i --threads $CPU --changes
else
echo "$GENOME has already been run, moving onto next iteration"
fi
done
cp pilon5.fasta pilon_final.fasta
cp pilon5.changes ../$OUTPUT.changes.txt
cd ..
#sort by size and rename fasta headers
funannotate sort -i pilon/pilon_final.fasta -o $OUTPUT
fi
#do a little cleanup as these files take up a lot of space, you don't need all these pilon BAM files, also remove the intermediate cleanup up fastqs
rm MITO*
rm CONTAM_PROKS*
rm CONTAM_EUKS*
rm -r pilon
rm $QR1
rm $QR2