Skip to content

Commit 12587be

Browse files
committed
v2.8
1 parent e463cd1 commit 12587be

15 files changed

+450
-101
lines changed

Scripts/AlignReads.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -100,9 +100,9 @@ def MapAndCount(sample):
100100
svg = config['svg']
101101
AlnOutput = config['AlnOutput']
102102
keepCutReads = config['keepCutReads']
103-
AlnFileSuffix = '_bw2Aln.tsv'
104-
GuideCount_Suffix = '_GuideCounts.tsv'
105-
GeneCount_Suffix = '_GeneCounts.tsv'
103+
AlnFileSuffix = '_bw2Aln.txt'
104+
GuideCount_Suffix = '_GuideCounts.txt'
105+
GeneCount_Suffix = '_GeneCounts.txt'
106106
cutadaptLog = sample+'_cutadapt_log.txt'
107107
logfilename = sample+'_AlignmentResults.txt'
108108

Scripts/AnalyzeControl.py

Lines changed: 24 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@
2020
import sys
2121
import time
2222
import yaml
23+
import scipy
24+
from scipy import stats
2325

2426
def EstimateControlCounts():
2527
# ------------------------------------------------
@@ -40,18 +42,18 @@ def EstimateControlCounts():
4042
AlnQCDir = config['AlnQCDir']
4143
ControlDir = config['ControlDir']
4244
res = config['dpi']
43-
thr_overdisp = config['thr_overdisp']
44-
CtrlCounts_Filename = 'Control_GuideCounts_0.tsv'
45+
p_overdisp = config['p_overdisp']
46+
CtrlCounts_Filename = 'Control_GuideCounts_0.txt'
4547

4648
# --------------------------------
4749
# Generate table of control counts
4850
# --------------------------------
4951
print('Reading control counts ...')
5052
os.chdir(AlnQCDir)
51-
ControlSamples = [d for d in os.listdir(AlnQCDir) if 'Control' in d]
53+
ControlSamples = [d for d in os.listdir(AlnQCDir) if 'Control' in d and 'Control_avg' not in d]
5254
os.chdir(ControlSamples[0])
5355
colnames = ['sgID','gene','counts']
54-
CountFile = pd.read_table(glob.glob('*GuideCounts_0.tsv')[0], sep='\t',names=colnames)
56+
CountFile = pd.read_table(glob.glob('*GuideCounts_0.txt')[0], sep='\t',names=colnames)
5557
sgIDs = list(CountFile['sgID'].values)
5658
genes = list(CountFile['gene'].values)
5759
L = len(sgIDs)
@@ -64,7 +66,7 @@ def EstimateControlCounts():
6466
os.chdir(AlnQCDir)
6567
for controlsample in ControlSamples:
6668
os.chdir(controlsample)
67-
filename = glob.glob('*GuideCounts_0.tsv')[0]
69+
filename = glob.glob('*GuideCounts_0.txt')[0]
6870
CountFile = pd.read_table(filename, sep='\t',names=colnames)
6971
counts = list(CountFile['counts'].values)
7072
CtrlCounts_df[controlsample] = counts
@@ -85,22 +87,26 @@ def EstimateControlCounts():
8587

8688
# --------------------------------------------------------------
8789
# Determine if the variance equals the mean (Poisson distribution)
88-
# --------------------------------------------------------------
89-
Svar0 = numpy.mean(SampleVar)
90-
if Svar0 == 0:
90+
# --------------------------------------------------------------
91+
if len(ControlSamples) == 1:
9192
Model = 'none'
92-
print('WARNING: Zero variance or no control replicates! Cannot choose statistical model.')
93+
print('WARNING: No control replicates! Cannot choose statistical model.')
9394
else:
94-
L0_list = [1 if Mean[k]>0 else 0 for k in range(L)]
95-
overdisp_list = [1 if Mean[k]>0 and SampleVar[k]>Mean[k] else 0 for k in range(L)]
96-
overdisp = sum(overdisp_list)/sum(L0_list)
97-
print('Overdispersion fraction: '+str(overdisp))
98-
if overdisp >= thr_overdisp:
95+
I = [i for i in range(L) if Mean[i]>0]
96+
Mean0 = [Mean[i] for i in I]
97+
Var0 = [SampleVar[i] for i in I]
98+
TestStat = scipy.stats.mannwhitneyu(Var0,Mean0,alternative='two-sided')
99+
if TestStat[1] >= p_overdisp:
100+
Model = 'Poisson'
101+
print('No overdispersion detected at p='+str(p_overdisp)+'. Choosing Poisson model ...')
102+
TestStat = scipy.stats.mannwhitneyu(Var0,Mean0,alternative='greater')
103+
if TestStat[1] < p_overdisp:
99104
Model = 'Neg. Binomial'
100-
print('Choosing negative binomial model ...')
105+
print('Overdispersion detected at p='+str(TestStat[1])+'. Choosing negative binomial model ...')
101106
else:
102-
Model = 'Poisson'
103-
print('Choosing Poisson model ...')
107+
Model = 'none'
108+
print('WARNING: Low variance in control samples! Cannot choose statistical model ...')
109+
104110

105111
# -----------------------------------------------
106112
# Model variance
@@ -120,7 +126,7 @@ def EstimateControlCounts():
120126
# Compute parameters for neg. binom. distribution
121127
# n: number of failures, p: probability of failure
122128
# -----------------------------------------------
123-
print('Computing parameters of negative binomial distribution ...')
129+
print('Computing distribution parameters ...')
124130
n = list(); p = list()
125131
for k in range(L):
126132
if Mean[k]==0 or Var[k]==0 :

Scripts/AnalyzeReadCounts.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -60,12 +60,12 @@ def AnalyzeCounts(sample):
6060
# --------------------------------------
6161
os.chdir(InputDir)
6262
colnames = ['ID','gene','counts']
63-
GuideFileName = glob.glob('*_GuideCounts_0.tsv')[0]
63+
GuideFileName = glob.glob('*_GuideCounts_0.txt')[0]
6464
GuideFile = pd.read_table(GuideFileName, sep='\t', names=colnames)
6565
ReadsPerGuide = list(GuideFile['counts'].values)
6666
L = len(ReadsPerGuide)
6767
colnames = ['gene','counts']
68-
GeneFileName = glob.glob('*_GeneCounts_0.tsv')[0]
68+
GeneFileName = glob.glob('*_GeneCounts_0.txt')[0]
6969
GeneFile = pd.read_table(GeneFileName, sep='\t', names=colnames)
7070
ReadsPerGene = list(GeneFile['counts'].values)
7171
sgID = list(GuideFile['ID'].values)
@@ -242,13 +242,13 @@ def AnalyzeCounts(sample):
242242
sec_elapsed = end_total - start_total
243243
if sec_elapsed < 60:
244244
time_elapsed = sec_elapsed
245-
print('Time elapsed (Total) [secs]: ' + '%.3f' % time_elapsed +'\n')
245+
print('Time elapsed [secs]: ' + '%.3f' % time_elapsed +'\n')
246246
elif sec_elapsed < 3600:
247247
time_elapsed = sec_elapsed/60
248-
print('Time elapsed (Total) [mins]: ' + '%.3f' % time_elapsed +'\n')
248+
print('Time elapsed [mins]: ' + '%.3f' % time_elapsed +'\n')
249249
else:
250250
time_elapsed = sec_elapsed/3600
251-
print('Time elapsed (Total) [hours]: ' + '%.3f' % time_elapsed +'\n')
251+
print('Time elapsed [hours]: ' + '%.3f' % time_elapsed +'\n')
252252

253253

254254
if __name__ == "__main__":

Scripts/AverageCounts.py

Lines changed: 162 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,162 @@
1+
#!/usr/bin/python
2+
# -*- coding: utf-8 -*-
3+
"""
4+
Created on Fri Oct 13 17:49:59 2017
5+
6+
@author: philipp
7+
"""
8+
9+
# Average counts across replicates
10+
# =======================================================================
11+
# Imports
12+
from __future__ import division # floating point division by default
13+
import pandas
14+
import numpy
15+
import os
16+
import glob
17+
import time
18+
import yaml
19+
import sys
20+
21+
22+
def AverageReadCounts(treatment):
23+
# ------------------------------------------------
24+
# Get parameters
25+
# ------------------------------------------------
26+
configFile = open('configuration.yaml','r')
27+
config = yaml.load(configFile)
28+
configFile.close()
29+
ScriptsDir = config['ScriptsDir']
30+
AlnQCDir = config['AlnQCDir']
31+
repl_avg = config['repl_avg']
32+
AvgDir = treatment+'_avg'
33+
34+
# ------------------------------------------------
35+
# Get counts from each replicate
36+
# ------------------------------------------------
37+
start = time.time()
38+
print('Processing '+treatment+' ...')
39+
colnames_s = ['sgRNA','gene','counts']
40+
colnames_g = ['gene','counts']
41+
os.chdir(AlnQCDir)
42+
ReplDirs = [d for d in os.listdir(AlnQCDir) if treatment in d and '_avg' not in d]
43+
R = len(ReplDirs)
44+
if R >= 2:
45+
print('Averaging read counts over '+str(R)+' replicates ...')
46+
AllGuideCounts = pandas.DataFrame()
47+
AllGuideCounts0 = pandas.DataFrame()
48+
AllGeneCounts = pandas.DataFrame()
49+
AllGeneCounts0 = pandas.DataFrame()
50+
for replicate in ReplDirs:
51+
os.chdir(replicate)
52+
# sgRNA counts
53+
filename = glob.glob('*GuideCounts.txt')[0]
54+
CountsFile = pandas.read_table(filename, sep='\t',names=colnames_s)
55+
CountsFile = CountsFile.sort_values(['gene','sgRNA'])
56+
sgIDs = list(CountsFile['sgRNA'])
57+
genes = list(CountsFile['gene'])
58+
counts = list(CountsFile['counts'])
59+
AllGuideCounts['sgRNA'] = sgIDs
60+
AllGuideCounts['gene'] = genes
61+
AllGuideCounts[replicate] = counts
62+
# normalized sgRNA counts
63+
filename = glob.glob('*GuideCounts_0.txt')[0]
64+
CountsFile = pandas.read_table(filename, sep='\t',names=colnames_s)
65+
CountsFile = CountsFile.sort_values(['gene','sgRNA'])
66+
sgIDs = list(CountsFile['sgRNA'])
67+
genes = list(CountsFile['gene'])
68+
counts = list(CountsFile['counts'])
69+
AllGuideCounts0['sgRNA'] = sgIDs
70+
AllGuideCounts0['gene'] = genes
71+
AllGuideCounts0[replicate] = counts
72+
# gene counts
73+
filename = glob.glob('*GeneCounts.txt')[0]
74+
CountsFile = pandas.read_table(filename, sep='\t',names=colnames_g)
75+
CountsFile = CountsFile.sort_values(['gene'])
76+
genes = list(CountsFile['gene'])
77+
counts = list(CountsFile['counts'])
78+
AllGeneCounts['gene'] = genes
79+
AllGeneCounts[replicate] = counts
80+
# normalized gene counts
81+
filename = glob.glob('*GeneCounts_0.txt')[0]
82+
CountsFile = pandas.read_table(filename, sep='\t',names=colnames_g)
83+
CountsFile = CountsFile.sort_values(['gene'])
84+
genes = list(CountsFile['gene'])
85+
counts = list(CountsFile['counts'])
86+
AllGeneCounts0['gene'] = genes
87+
AllGeneCounts0[replicate] = counts
88+
os.chdir(AlnQCDir)
89+
# ------------------------------------------------
90+
# Compute averages
91+
# ------------------------------------------------
92+
# sgRNA counts
93+
repl_counts = AllGuideCounts.iloc[:,2:]
94+
if repl_avg == 'median':
95+
avg_counts = repl_counts.median(axis=1)
96+
elif repl_avg == 'mean':
97+
avg_counts = repl_counts.mean(axis=1)
98+
AllGuideCounts[treatment+'_avg'] = avg_counts
99+
del_columns = range(2,2+R)
100+
AllGuideCounts.drop(AllGuideCounts.columns[del_columns],axis=1,inplace=True)
101+
# normalized sgRNA counts
102+
repl_counts = AllGuideCounts0.iloc[:,2:]
103+
if repl_avg == 'median':
104+
avg_counts = repl_counts.median(axis=1)
105+
elif repl_avg == 'mean':
106+
avg_counts = repl_counts.mean(axis=1)
107+
AllGuideCounts0[treatment+'_avg'] = avg_counts
108+
del_columns = range(2,2+R)
109+
AllGuideCounts0.drop(AllGuideCounts0.columns[del_columns],axis=1,inplace=True)
110+
# gene counts
111+
repl_counts = AllGeneCounts.iloc[:,1:]
112+
if repl_avg == 'median':
113+
avg_counts = repl_counts.median(axis=1)
114+
elif repl_avg == 'mean':
115+
avg_counts = repl_counts.mean(axis=1)
116+
AllGeneCounts[treatment+'_avg'] = avg_counts
117+
del_columns = range(1,1+R)
118+
AllGeneCounts.drop(AllGeneCounts.columns[del_columns],axis=1,inplace=True)
119+
# normalized gene counts
120+
repl_counts = AllGeneCounts0.iloc[:,1:]
121+
if repl_avg == 'median':
122+
avg_counts = repl_counts.median(axis=1)
123+
elif repl_avg == 'mean':
124+
avg_counts = repl_counts.mean(axis=1)
125+
AllGeneCounts0[treatment+'_avg'] = avg_counts
126+
del_columns = range(1,1+R)
127+
AllGeneCounts0.drop(AllGeneCounts0.columns[del_columns],axis=1,inplace=True)
128+
# ------------------------------------------------
129+
# Write result dataframes
130+
# ------------------------------------------------
131+
os.chdir(AlnQCDir)
132+
if not os.path.exists(AvgDir):
133+
os.makedirs(AvgDir)
134+
os.chdir(AvgDir)
135+
AllGuideCounts.to_csv(treatment+'_avg_GuideCounts.txt', sep = '\t', index = False, header = False)
136+
AllGuideCounts0.to_csv(treatment+'_avg_GuideCounts_0.txt', sep = '\t', index = False, header = False)
137+
AllGeneCounts.to_csv(treatment+'_avg_GeneCounts.txt', sep = '\t', index = False, header = False)
138+
AllGeneCounts0.to_csv(treatment+'_avg_GeneCounts_0.txt', sep = '\t', index = False, header = False)
139+
else:
140+
print('(No replicates found)')
141+
142+
143+
# --------------------------------------
144+
# Time stamp
145+
# --------------------------------------
146+
os.chdir(ScriptsDir)
147+
end = time.time()
148+
# Final time stamp
149+
sec_elapsed = end - start
150+
if sec_elapsed < 60:
151+
time_elapsed = sec_elapsed
152+
print('Time elapsed [secs]: ' + '%.3f' % time_elapsed)
153+
elif sec_elapsed < 3600:
154+
time_elapsed = sec_elapsed/60
155+
print('Time elapsed [mins]: ' + '%.3f' % time_elapsed)
156+
else:
157+
time_elapsed = sec_elapsed/3600
158+
print('Time elapsed [hours]: ' + '%.3f' % time_elapsed)
159+
160+
if __name__ == "__main__":
161+
input1 = sys.argv[1]
162+
AverageReadCounts(input1)

Scripts/CheckSequenceQuality.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,10 @@ def RunSeqQC():
2929
os.chdir(DataDir)
3030
FileNames = [d for d in os.listdir(DataDir)]
3131
for filename in FileNames:
32-
os.system('fastqc -o '+SeqQCDir+' --extract '+filename)
32+
if filename[-8:] == 'fastq.gz':
33+
os.system('fastqc -o '+SeqQCDir+' --extract '+filename)
34+
elif filename[-5:] == 'fastq':
35+
os.system('fastqc -o '+SeqQCDir+' '+filename)
3336
os.chdir(SeqQCDir)
3437
os.system('rm *.zip')
3538
os.chdir(ScriptsDir)

0 commit comments

Comments
 (0)