ChromDiff/plot_sig_majority_args_v2.sh at master · hcharles14/ChromDiff · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
#!/bin/bash
# Copyright 2015 Angela Yen
# This file is part of ChromDiff.
# ChromDiff is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# ChromDiff is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with ChromDiff.  If not, see <http://www.gnu.org/licenses/>.
## After running mk_clust_heatmaps.R to generate first-time dendrogram/pvclust, you can look for dendrogram with heights in the file pvclust_domstate.pdf in the corresponding plots/core/*/perc/wilcox/fdr/ file
## Then pick a height cutoff and re-run plot_sig_majority.sh with the new height cutoff

test_one=false
if [[ "$test_one" == true ]]; then
	testtype="wilcox"
	correction="fdr"
	property="sex"
	group1="Female"
	group2="Male"
	heightcutoff=83
	model="core_gencode_v10_1kb"
	metadatafile="data/final_celltype_metadata.txt"
	genefile="data/gencode_genes_1kb_up.txt"
	covariate_mat_file="data/cov.mat.txt"
	map_covariates_file="data/map_vars_covariates.txt"
	expfile="data/57epigenomes.RPKM.pc"
	state_annotations_file="data/core_annotation.txt"
	generegions_label="gencode_v10_1kb"
else
	property=$1
	group1=$2
	group2=$3
	testtype=$4
	correction=$5
	heightcutoff=$6
	model=$7
	metadatafile=$8
	genefile=${9}
	covariate_mat_file=${10}
	map_covariates_file=${11}
	expfile=${12}
	state_annotations_file=${13}
	generegions_label=${14}
fi


label1=$group1
label2=$group2
metric="perc"

sig_dir="sig_pvals"
sig_file="${sig_dir}/${model}/${metric}.${testtype}.${correction}.${property}.${label1}.${label2}.txt"
if [ ! -s $sig_file ]; then
	echo "$sig_file not found or of size 0. Exiting..."
	exit 0
fi
maj_dir="majority"

echo "Starting combination of ${model} $metric $testtype $correction $label1 and $label2..."

currprefix="${metric}.${testtype}.${correction}.${property}.${label1}.${label2}"


## what genes had at least one significant feature for this combination?
echo "Identifying genes..."
outdir="sig_majority/${model}"
mkdir -p $outdir
outgenefile="${outdir}/${currprefix}.genes.txt"
awk 'BEGIN{FS="_"} { print $1 }' $sig_file | sort -k1,1 | uniq  > $outgenefile

if [ ! -s $outgenefile ]; then
	echo "No significant genes found for $currprefix."
	exit 0
else
	subdir="celltype"
	## find majority state for each "significant" gene ###########
	echo "Processing epigenomes..."
	IFS=$'\t'
	header=true
	while read epid statecallfile rest; do
        if $header ;
        then
                header=false
        else
            #echo "Processing $epid features..."
			if [[ "$metric" == "windows" || "$metric" == "deltawindows" ]]; then
				maj_file="${maj_dir}/${model}/${epid}_majstate_windows.txt"
			else
				maj_file="${maj_dir}/${model}/${epid}_majstate.txt"
			fi
			if [ -s $maj_file ]; then
				mkdir -p "${outdir}/${subdir}"
				awk -v outgenefile=${outgenefile} 'BEGIN{ while(getline < outgenefile) { genes[$1]="TRUE"}}
					{if ($5 in genes){split($6, elts, "_"); found[$5]=elts[1]; delete elts}}
					END{for (geneid in genes) { if (geneid in found) {print geneid, found[geneid];} else {print geneid, "NA"}}} ' ${maj_file} | sort -k1,1 > "${outdir}/${subdir}/${epid}_${currprefix}.txt"
			fi
        fi
	done < $metadatafile

	####### make a single matrix with all the information ########
	echo "Making matrix..."
	matfile="${outdir}/${currprefix}_matrix.txt"
	## Write column names (genes)
	awk 'BEGIN{ printf "celltype\t"} {printf "%s\t", $1 } END{print ""}' $outgenefile > $matfile
	## for each celltype and gene, write the majority state
	IFS=$'\t'
	header=true
	while read epid statecallfile rest; do
        if $header ;
        then
                header=false
        else
			currfile="${outdir}/${subdir}/${epid}_${currprefix}.txt"
			if [ -s $currfile ]; then
				awk -v epid=${epid} 'BEGIN{ printf "%s\t", epid} ($2=="NA"){printf "NA\t";} ($2!="NA"){printf "%i\t", $2 } END{print ""}' $currfile >> $matfile
			fi
        fi
	done < $metadatafile


	#### with data, generate plots #####
	for plottype in "domstate"
	do
		echo "Generating plot type $plottype..."
		echo "Generating majority state heatmap..."
		echo Rscript mk_sig_maj_heatmap.R $metric $testtype $correction $property $group1 $group2 $label1 $label2 ${model} $plottype $metadatafile $genefile $covariate_mat_file $map_covariates_file $expfile $state_annotations_file $generegions_label
		Rscript mk_sig_maj_heatmap.R $metric $testtype $correction $property $group1 $group2 $label1 $label2 ${model} $plottype $metadatafile $genefile $covariate_mat_file $map_covariates_file $expfile $state_annotations_file $generegions_label


		echo "Generating expression heatmap..."
		Rscript mk_exp_heatmap.R $metric $testtype $correction $property $group1 $group2 $label1 $label2 ${model} $plottype $metadatafile $genefile $covariate_mat_file $map_covariates_file $expfile $state_annotations_file $generegions_label


		echo "Generating clustered heatmap..."
		Rscript mk_clust_heatmaps.R $metric $testtype $correction $property $group1 $group2  $label1 $label2 ${model} $heightcutoff ${plottype} $metadatafile $genefile $covariate_mat_file $map_covariates_file $expfile $state_annotations_file $generegions_label

	done
fi