Skip to content

Commit ccfee65

Browse files
committed
v2.8.1
1 parent 9722462 commit ccfee65

File tree

5 files changed

+83
-66
lines changed

5 files changed

+83
-66
lines changed

Scripts/AnalyzeControl.py

Lines changed: 19 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -95,17 +95,28 @@ def EstimateControlCounts():
9595
I = [i for i in range(L) if Mean[i]>0]
9696
Mean0 = [Mean[i] for i in I]
9797
Var0 = [SampleVar[i] for i in I]
98-
TestStat = scipy.stats.mannwhitneyu(Var0,Mean0,alternative='two-sided')
98+
TestStat = scipy.stats.wilcoxon(Var0,Mean0)
9999
if TestStat[1] >= p_overdisp:
100100
Model = 'Poisson'
101-
print('No overdispersion detected at p='+str(p_overdisp)+'. Choosing Poisson model ...')
102-
TestStat = scipy.stats.mannwhitneyu(Var0,Mean0,alternative='greater')
103-
if TestStat[1] < p_overdisp:
104-
Model = 'Neg. Binomial'
105-
print('Overdispersion detected at p='+str(TestStat[1])+'. Choosing negative binomial model ...')
101+
print('Cannot reject equality of read count mean and variance (p='+str(p_overdisp)+'). Choosing Poisson model ...')
106102
else:
107-
Model = 'none'
108-
print('WARNING: Low variance in control samples! Cannot choose statistical model ...')
103+
# compute rank sums manually (** scipy does not allow one-sided Wilcoxon tests **)
104+
I = [i for i in range(len(Mean0)) if Var0[i]!=Mean0[i]]
105+
Mean00 = [Mean0[i] for i in I]
106+
Var00 = [Var0[i] for i in I]
107+
Delta = [numpy.abs(Var00[i]-Mean00[i]) for i in range(len(Mean00))]
108+
sig = [1 if Var00[i]>Mean00[i] else -1 for i in range(len(Mean00))]
109+
Ranks = scipy.stats.mstats.rankdata(Delta)
110+
Ranks_pos = [Ranks[i] for i in range(len(Mean00)) if sig[i]>0]
111+
Ranks_neg = [Ranks[i] for i in range(len(Mean00)) if sig[i]<0]
112+
W_pos = sum(Ranks_pos)
113+
W_neg = sum(Ranks_neg)
114+
if W_pos > W_neg:
115+
Model = 'Neg. Binomial'
116+
print('Overdispersion detected at p='+str(TestStat[1])+'. Choosing negative binomial model ...')
117+
else:
118+
Model = 'Neg. Binomial' # for lack of better choice...
119+
print('WARNING: Low variance in control samples (underdispersion)!')
109120

110121

111122
# -----------------------------------------------

Scripts/BuildLibraryIndex.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -47,12 +47,12 @@ def BuildBowtieIndex():
4747
os.chdir(LibDir)
4848
LibCols = ['gene','ID','seq']
4949
LibFile = pd.read_table(LibFilename, sep = libsep, skiprows = 1, names = LibCols)
50-
seq = LibFile['seq'].values
51-
IDs = LibFile['ID'].values
50+
seq = list(LibFile['seq'])
51+
IDs = list(LibFile['ID'])
5252
with open('library.fasta','w') as library_fasta:
5353
for k in range(len(IDs)):
54-
library_fasta.write('>'+IDs[k]+'\n')
55-
library_fasta.write(seq[k]+'\n')
54+
library_fasta.write('>'+str(IDs[k])+'\n')
55+
library_fasta.write(str(seq[k])+'\n')
5656
library_fasta.close()
5757

5858
# ----------------------------------

Scripts/CombineGeneRanks.py

Lines changed: 49 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -37,56 +37,57 @@ def GeneRankCombination(treatment):
3737
os.chdir(GeneDir)
3838
treatment_files = [f for f in os.listdir(GeneDir) if treatment in f\
3939
and metric in f and 'combined' not in f]
40-
treatment_files.sort()
41-
K = len(treatment_files)
42-
ResultTable = pandas.DataFrame()
43-
X1 = pandas.read_table(treatment_files[0], sep='\t')
44-
# Pre-process gene rank tables in case of STARS
45-
if metric == 'STARS':
46-
# Compute consensus gene list (present in all replicates)
47-
print('Computing consensus gene list from STARS output ...')
48-
Genes_0 = set(X1['gene'])
49-
for treatment_file in treatment_files:
50-
X = pandas.read_table(treatment_file, sep='\t')
51-
Genes = set(X['gene'])
52-
Genes_0 = Genes_0.intersection(Genes)
53-
G = len(Genes_0)
54-
else:
55-
G = len(X1)
56-
# Read replicates
57-
chi = list(numpy.zeros(G))
58-
k = 0
59-
for treatment_file in treatment_files:
60-
k+=1
61-
print('Reading '+treatment+' replicate '+str(k)+' ...')
62-
X = pandas.read_table(treatment_file, sep='\t')
40+
if len(treatment_files) > 1:
41+
treatment_files.sort()
42+
K = len(treatment_files)
43+
ResultTable = pandas.DataFrame()
44+
X1 = pandas.read_table(treatment_files[0], sep='\t')
45+
# Pre-process gene rank tables in case of STARS
6346
if metric == 'STARS':
64-
# use only genes from consensus list
65-
I = [X[X['gene']==gene].index[0] for gene in Genes_0]
66-
X0 = X.iloc[I]
67-
X0.sort_values('gene',ascending=1)
47+
# Compute consensus gene list (present in all replicates)
48+
print('Computing consensus gene list from STARS output ...')
49+
Genes_0 = set(X1['gene'])
50+
for treatment_file in treatment_files:
51+
X = pandas.read_table(treatment_file, sep='\t')
52+
Genes = set(X['gene'])
53+
Genes_0 = Genes_0.intersection(Genes)
54+
G = len(Genes_0)
6855
else:
69-
X0 = X.sort_values('gene',ascending=1)
70-
genes = list(X0['gene'])
71-
ResultTable['gene'] = genes
72-
pval = list(X0['p_value (adj.)'])
73-
ResultTable['p-value Repl. '+str(k)] = pval
74-
ln_pval = [numpy.log(pval[i]+eps) for i in range(G)]
75-
chi = numpy.add(chi,ln_pval)
76-
77-
# Combine p-values
78-
print('Computing Fisher statistic ...')
79-
chi = [-2*chi[i] for i in range(G)]
80-
ResultTable['Fisher Statistic'] = chi
81-
PVal = [1 - scipy.stats.chi2.cdf(chi[i],2*K) for i in range(G)]
82-
ResultTable['p-value combined'] = PVal
83-
significant = [PVal[i] < alpha for i in range(G)]
84-
ResultTable['significant'] = significant
85-
ResultTable = ResultTable.sort_values(['significant','p-value combined'],ascending=[0,1])
86-
print('Writing results dataframe ...')
87-
ResultFilename = treatment+'_combined_'+str(alpha)+'_'+str(padj)+'_'+str(metric)\
88-
+'_P'+str(Np)+'_GeneList.txt'
89-
ResultTable.to_csv(ResultFilename, sep = '\t', index = False)
56+
G = len(X1)
57+
# Read replicates
58+
chi = list(numpy.zeros(G))
59+
k = 0
60+
for treatment_file in treatment_files:
61+
k+=1
62+
print('Reading '+treatment+' replicate '+str(k)+' ...')
63+
X = pandas.read_table(treatment_file, sep='\t')
64+
if metric == 'STARS':
65+
# use only genes from consensus list
66+
I = [X[X['gene']==gene].index[0] for gene in Genes_0]
67+
X0 = X.iloc[I]
68+
X0.sort_values('gene',ascending=1)
69+
else:
70+
X0 = X.sort_values('gene',ascending=1)
71+
genes = list(X0['gene'])
72+
ResultTable['gene'] = genes
73+
pval = list(X0['p_value (adj.)'])
74+
ResultTable['p-value Repl. '+str(k)] = pval
75+
ln_pval = [numpy.log(pval[i]+eps) for i in range(G)]
76+
chi = numpy.add(chi,ln_pval)
77+
78+
# Combine p-values
79+
print('Computing Fisher statistic ...')
80+
chi = [-2*chi[i] for i in range(G)]
81+
ResultTable['Fisher Statistic'] = chi
82+
PVal = [1 - scipy.stats.chi2.cdf(chi[i],2*K) for i in range(G)]
83+
ResultTable['p-value combined'] = PVal
84+
significant = [PVal[i] < alpha for i in range(G)]
85+
ResultTable['significant'] = significant
86+
ResultTable = ResultTable.sort_values(['significant','p-value combined'],ascending=[0,1])
87+
print('Writing results dataframe ...')
88+
ResultFilename = treatment+'_combined_'+str(alpha)+'_'+str(padj)+'_'+str(metric)\
89+
+'_P'+str(Np)+'_GeneList.txt'
90+
ResultTable.to_csv(ResultFilename, sep = '\t', index = False)
9091

9192
# Time stamp
9293
end = time.time()

Scripts/PrintStatus.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212

1313
def PrintStatus_Header():
1414
print('**************************************************')
15-
print('Launching PinAPL-Py v2.8..')
15+
print('Launching PinAPL-Py v2.8.1')
1616
print('P. Spahn et al., UC San Diego (11/2017)')
1717
print('**************************************************')
1818

@@ -73,4 +73,4 @@ def PrintStatus_TimeStamp(msg):
7373
elif input1 == 'AllDone':
7474
PrintStatus_AllDone()
7575
elif input1 == 'TimeStamp':
76-
PrintStatus_TimeStamp(input2)
76+
PrintStatus_TimeStamp(input2)

Scripts/RankGenes.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -239,7 +239,7 @@ def GeneRankingAnalysis(sample):
239239
global lfc; lfc = list(lfc_DF['lfc'])
240240
parjob = Parallel(n_jobs=num_cores)(delayed(AverageLogFC)(g) for g in range(G))
241241
nGuides = [parjob[g][0] for g in range(G)]
242-
AvgLogFCs = [parjob[g][1] for g in range(G)]
242+
AvgLogFC = [parjob[g][1] for g in range(G)]
243243

244244

245245
# -------------------------------------------
@@ -289,7 +289,7 @@ def GeneRankingAnalysis(sample):
289289
metric_sig = multTest[0]
290290
metric_pval0 = multTest[1]
291291
else: # no control replicates
292-
print('### ERROR: Cannot compute aRRA scores without control replicates! ###')
292+
print('### ERROR: Cannot compute aRRA scores without significant sgRNAs! ###')
293293
SortFlag = True
294294
metric = [-1 for k in range(G)]
295295
metric_pval = [-1 for k in range(G)]
@@ -334,11 +334,16 @@ def GeneRankingAnalysis(sample):
334334
STARS_output = glob.glob('counts_STARSOutput*.txt')[0]
335335
STARS = pandas.read_table(STARS_output, sep='\t')
336336
geneList_s = list(STARS['Gene Symbol'].values)
337+
# Reduce gene list to genes reported by STARS
337338
G = len(geneList_s)
338339
s_index = [geneList.index(geneList_s[k]) for k in range(G)]
339340
geneList = geneList_s
340341
sigGuides_s = [sigGuides[s_index[k]] for k in range(G)]
341342
sigGuides = sigGuides_s
343+
AvgLogFC_s = [AvgLogFC[s_index[k]] for k in range(G)]
344+
AvgLogFC = AvgLogFC_s
345+
nGuides_s = [nGuides[s_index[k]] for k in range(G)]
346+
nGuides = nGuides_s
342347
metric = list(STARS['STARS Score'].values)
343348
metric_pval = list(STARS['p-value'].values)
344349
multTest = multipletests(metric_pval,alpha,padj)
@@ -353,7 +358,7 @@ def GeneRankingAnalysis(sample):
353358
# Run non-parametric permutation analysis
354359
# -------------------------------------------------
355360
SortFlag = False
356-
metric = AvgLogFCs
361+
metric = AvgLogFC
357362
# Compute permutations
358363
I_perm = numpy.random.choice(L,size=(Np,r),replace=False)
359364
metric_null = Parallel(n_jobs=num_cores)(delayed(AvgLogFC_null)(I) for I in I_perm)
@@ -393,7 +398,7 @@ def GeneRankingAnalysis(sample):
393398
'significant': [str(metric_sig[g]) for g in range(G)],
394399
'# sgRNAs': [nGuides[g] for g in range(G)],
395400
'# signif. sgRNAs': [sigGuides[g] for g in range(G)],
396-
'avg. logFC': [AvgLogFCs[g] for g in range(G)]},
401+
'avg. logFC': [AvgLogFC[g] for g in range(G)]},
397402
columns = ['gene',GeneMetric,'p_value','p_value (adj.)',\
398403
'significant','# sgRNAs','# signif. sgRNAs','avg. logFC'])
399404
if GeneMetric == 'AVGLFC':

0 commit comments

Comments
 (0)