LewisLabUCSD
diff --git a/‎Scripts/AlignReads.py‎
Lines changed: 3 additions & 3 deletions b/‎Scripts/AlignReads.py‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎Scripts/AnalyzeControl.py‎
Lines changed: 24 additions & 18 deletions b/‎Scripts/AnalyzeControl.py‎
Lines changed: 24 additions & 18 deletions
diff --git a/‎Scripts/AnalyzeReadCounts.py‎
Lines changed: 5 additions & 5 deletions b/‎Scripts/AnalyzeReadCounts.py‎
Lines changed: 5 additions & 5 deletions
diff --git a/‎Scripts/AverageCounts.py‎
Lines changed: 162 additions & 0 deletions b/‎Scripts/AverageCounts.py‎
Lines changed: 162 additions & 0 deletions
diff --git a/‎Scripts/CheckSequenceQuality.py‎
Lines changed: 4 additions & 1 deletion b/‎Scripts/CheckSequenceQuality.py‎
Lines changed: 4 additions & 1 deletion
@@ -100,9 +100,9 @@ def MapAndCount(sample):
     svg = config['svg']
     AlnOutput = config['AlnOutput']
     keepCutReads = config['keepCutReads']
-    AlnFileSuffix = '_bw2Aln.tsv'
-    GuideCount_Suffix = '_GuideCounts.tsv'
-    GeneCount_Suffix = '_GeneCounts.tsv'
+    AlnFileSuffix = '_bw2Aln.txt'
+    GuideCount_Suffix = '_GuideCounts.txt'
+    GeneCount_Suffix = '_GeneCounts.txt'
     cutadaptLog = sample+'_cutadapt_log.txt'
     logfilename = sample+'_AlignmentResults.txt'
 
 
@@ -20,6 +20,8 @@
 import sys
 import time
 import yaml
+import scipy
+from scipy import stats
 
 def EstimateControlCounts(): 
     # ------------------------------------------------
@@ -40,18 +42,18 @@ def EstimateControlCounts():
     AlnQCDir = config['AlnQCDir']
     ControlDir = config['ControlDir']
     res = config['dpi']
-    thr_overdisp = config['thr_overdisp']
-    CtrlCounts_Filename = 'Control_GuideCounts_0.tsv'
+    p_overdisp = config['p_overdisp']
+    CtrlCounts_Filename = 'Control_GuideCounts_0.txt'
 
     # --------------------------------    
     # Generate table of control counts
     # --------------------------------    
     print('Reading control counts ...')    
     os.chdir(AlnQCDir)
-    ControlSamples = [d for d in os.listdir(AlnQCDir) if 'Control' in d]
+    ControlSamples = [d for d in os.listdir(AlnQCDir) if 'Control' in d and 'Control_avg' not in d]
     os.chdir(ControlSamples[0])
     colnames = ['sgID','gene','counts']                      
-    CountFile = pd.read_table(glob.glob('*GuideCounts_0.tsv')[0], sep='\t',names=colnames)
+    CountFile = pd.read_table(glob.glob('*GuideCounts_0.txt')[0], sep='\t',names=colnames)
     sgIDs = list(CountFile['sgID'].values)
     genes = list(CountFile['gene'].values)
     L = len(sgIDs)
@@ -64,7 +66,7 @@ def EstimateControlCounts():
         os.chdir(AlnQCDir)
         for controlsample in ControlSamples:
             os.chdir(controlsample)
-            filename = glob.glob('*GuideCounts_0.tsv')[0]                          
+            filename = glob.glob('*GuideCounts_0.txt')[0]                          
             CountFile = pd.read_table(filename, sep='\t',names=colnames)
             counts = list(CountFile['counts'].values)
             CtrlCounts_df[controlsample] = counts
@@ -85,22 +87,26 @@ def EstimateControlCounts():
 
     # --------------------------------------------------------------    
     # Determine if the variance equals the mean (Poisson distribution)
-    # --------------------------------------------------------------      
-    Svar0 = numpy.mean(SampleVar)
-    if Svar0 == 0:
+    # --------------------------------------------------------------       
+    if len(ControlSamples) == 1:
         Model = 'none'
-        print('WARNING: Zero variance or no control replicates! Cannot choose statistical model.')
+        print('WARNING: No control replicates! Cannot choose statistical model.')
     else:
-        L0_list = [1 if Mean[k]>0 else 0 for k in range(L)]
-        overdisp_list = [1 if Mean[k]>0 and SampleVar[k]>Mean[k] else 0 for k in range(L)]
-        overdisp = sum(overdisp_list)/sum(L0_list)
-        print('Overdispersion fraction: '+str(overdisp))
-        if overdisp >= thr_overdisp:
+        I = [i for i in range(L) if Mean[i]>0]        
+        Mean0 = [Mean[i] for i in I]
+        Var0 = [SampleVar[i] for i in I]
+        TestStat = scipy.stats.mannwhitneyu(Var0,Mean0,alternative='two-sided')
+        if TestStat[1] >= p_overdisp:
+            Model = 'Poisson'
+            print('No overdispersion detected at p='+str(p_overdisp)+'. Choosing Poisson model ...')
+        TestStat = scipy.stats.mannwhitneyu(Var0,Mean0,alternative='greater')
+        if TestStat[1] < p_overdisp:
             Model = 'Neg. Binomial'
-            print('Choosing negative binomial model ...')
+            print('Overdispersion detected at p='+str(TestStat[1])+'. Choosing negative binomial model ...')
         else:
-            Model = 'Poisson'
-            print('Choosing Poisson model ...')
+            Model = 'none'
+            print('WARNING: Low variance in control samples! Cannot choose statistical model ...')            
+
 
     # -----------------------------------------------    
     # Model variance
@@ -120,7 +126,7 @@ def EstimateControlCounts():
         # Compute parameters for neg. binom. distribution 
         # n: number of failures, p: probability of failure
         # -----------------------------------------------
-        print('Computing parameters of negative binomial distribution ...')
+        print('Computing distribution parameters ...')
         n = list(); p = list()
         for k in range(L):
             if Mean[k]==0 or Var[k]==0 :
 
@@ -60,12 +60,12 @@ def AnalyzeCounts(sample):
     # --------------------------------------
     os.chdir(InputDir)
     colnames = ['ID','gene','counts']
-    GuideFileName = glob.glob('*_GuideCounts_0.tsv')[0]
+    GuideFileName = glob.glob('*_GuideCounts_0.txt')[0]
     GuideFile = pd.read_table(GuideFileName, sep='\t', names=colnames)
     ReadsPerGuide = list(GuideFile['counts'].values)
     L = len(ReadsPerGuide)   
     colnames = ['gene','counts']    
-    GeneFileName = glob.glob('*_GeneCounts_0.tsv')[0]
+    GeneFileName = glob.glob('*_GeneCounts_0.txt')[0]
     GeneFile = pd.read_table(GeneFileName, sep='\t', names=colnames)
     ReadsPerGene = list(GeneFile['counts'].values)
     sgID = list(GuideFile['ID'].values)    
@@ -242,13 +242,13 @@ def AnalyzeCounts(sample):
     sec_elapsed = end_total - start_total
     if sec_elapsed < 60:
         time_elapsed = sec_elapsed
-        print('Time elapsed (Total) [secs]: ' + '%.3f' % time_elapsed +'\n')
+        print('Time elapsed [secs]: ' + '%.3f' % time_elapsed +'\n')
     elif sec_elapsed < 3600:
         time_elapsed = sec_elapsed/60
-        print('Time elapsed (Total) [mins]: ' + '%.3f' % time_elapsed +'\n')
+        print('Time elapsed [mins]: ' + '%.3f' % time_elapsed +'\n')
     else:
         time_elapsed = sec_elapsed/3600
-        print('Time elapsed (Total) [hours]: ' + '%.3f' % time_elapsed +'\n')            
+        print('Time elapsed [hours]: ' + '%.3f' % time_elapsed +'\n')            
 
 
 if __name__ == "__main__":
 
@@ -0,0 +1,162 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Oct 13 17:49:59 2017
+
+@author: philipp
+"""
+
+# Average counts across replicates
+# =======================================================================
+# Imports
+from __future__ import division # floating point division by default
+import pandas
+import numpy
+import os
+import glob
+import time
+import yaml
+import sys
+
+
+def _format_elapsed(sec_elapsed):
+    """Return the 'Time elapsed' message in seconds, minutes or hours."""
+    if sec_elapsed < 60:
+        return 'Time elapsed [secs]: ' + '%.3f' % sec_elapsed
+    if sec_elapsed < 3600:
+        return 'Time elapsed [mins]: ' + '%.3f' % (sec_elapsed/60)
+    return 'Time elapsed [hours]: ' + '%.3f' % (sec_elapsed/3600)
+
+
+def _average_replicates(base_dir, repl_dirs, pattern, id_columns, sort_keys,
+                        repl_avg, avg_name):
+    """Average one kind of count file over all replicate directories.
+
+    base_dir   : directory that contains the replicate directories
+    repl_dirs  : names of the replicate directories inside base_dir
+    pattern    : glob pattern of the count file inside each replicate
+    id_columns : identifier columns preceding the 'counts' column
+    sort_keys  : columns used to bring every replicate into the same order
+    repl_avg   : 'median' or 'mean'
+    avg_name   : name of the column holding the averaged counts
+
+    Returns a data frame with the identifier columns followed by avg_name.
+    Raises IOError if a replicate directory has no file matching pattern.
+    """
+    table = pandas.DataFrame()
+    for replicate in repl_dirs:
+        os.chdir(os.path.join(base_dir, replicate))
+        try:
+            matches = glob.glob(pattern)
+            if not matches:
+                raise IOError('No file matching '+pattern+' found in '+replicate)
+            CountsFile = pandas.read_table(matches[0], sep='\t',
+                                           names=id_columns+['counts'])
+        finally:
+            # always return to the base directory, even if reading failed
+            os.chdir(base_dir)
+        CountsFile = CountsFile.sort_values(sort_keys)
+        # Plain lists are assigned on purpose: sorting shuffles the index,
+        # and assigning a Series would re-align the rows by that index.
+        for column in id_columns:
+            table[column] = list(CountsFile[column])
+        table[replicate] = list(CountsFile['counts'])
+    repl_counts = table[list(repl_dirs)]
+    if repl_avg == 'median':
+        table[avg_name] = repl_counts.median(axis=1)
+    else:
+        table[avg_name] = repl_counts.mean(axis=1)
+    # keep identifiers and average only; replicate columns are dropped
+    return table[id_columns+[avg_name]]
+
+
+def AverageReadCounts(treatment):
+    """Average guide and gene read counts over the replicates of a treatment.
+
+    Reads 'ScriptsDir', 'AlnQCDir' and 'repl_avg' from configuration.yaml in
+    the current working directory. Every directory in AlnQCDir whose name
+    contains `treatment` but not '_avg' is treated as a replicate. If at
+    least two replicates exist, the raw and '_0' guide/gene count tables are
+    averaged row-wise (median or mean, as selected by 'repl_avg') and written
+    as tab-separated files without header to AlnQCDir/<treatment>_avg.
+    Finally the working directory is set to ScriptsDir and the elapsed time
+    is printed.
+
+    Raises ValueError if replicates are found and 'repl_avg' is neither
+    'median' nor 'mean'.
+    """
+    # ------------------------------------------------
+    # Get parameters
+    # ------------------------------------------------
+    # safe_load: the configuration only needs plain YAML types, and
+    # yaml.load() without an explicit Loader is rejected by recent PyYAML
+    with open('configuration.yaml','r') as configFile:
+        config = yaml.safe_load(configFile)
+    ScriptsDir = config['ScriptsDir']
+    AlnQCDir = config['AlnQCDir']
+    repl_avg = config['repl_avg']
+    AvgDir = treatment+'_avg'
+
+    # ------------------------------------------------
+    # Get counts from each replicate
+    # ------------------------------------------------
+    start = time.time()
+    print('Processing '+treatment+' ...')
+    os.chdir(AlnQCDir)
+    # only directories can be replicates; stray files are ignored
+    ReplDirs = [d for d in os.listdir(AlnQCDir)
+                if treatment in d and '_avg' not in d
+                and os.path.isdir(os.path.join(AlnQCDir, d))]
+    R = len(ReplDirs)
+    if R >= 2:
+        if repl_avg not in ('median','mean'):
+            raise ValueError("repl_avg must be 'median' or 'mean', got: "+repr(repl_avg))
+        print('Averaging read counts over '+str(R)+' replicates ...')
+        # (input file pattern, identifier columns, sort keys, output suffix)
+        jobs = [
+            ('*GuideCounts.txt', ['sgRNA','gene'], ['gene','sgRNA'], '_avg_GuideCounts.txt'),
+            ('*GuideCounts_0.txt', ['sgRNA','gene'], ['gene','sgRNA'], '_avg_GuideCounts_0.txt'),
+            ('*GeneCounts.txt', ['gene'], ['gene'], '_avg_GeneCounts.txt'),
+            ('*GeneCounts_0.txt', ['gene'], ['gene'], '_avg_GeneCounts_0.txt'),
+        ]
+        # ------------------------------------------------
+        # Compute averages
+        # ------------------------------------------------
+        # all tables are computed before anything is written
+        results = [(suffix, _average_replicates(AlnQCDir, ReplDirs, pattern,
+                                                id_columns, sort_keys,
+                                                repl_avg, treatment+'_avg'))
+                   for pattern, id_columns, sort_keys, suffix in jobs]
+        # ------------------------------------------------
+        # Write result dataframes
+        # ------------------------------------------------
+        os.chdir(AlnQCDir)
+        if not os.path.exists(AvgDir):
+            os.makedirs(AvgDir)
+        os.chdir(AvgDir)
+        for suffix, table in results:
+            table.to_csv(treatment+suffix, sep = '\t', index = False, header = False)
+    else:
+        print('(No replicates found)')
+
+
+    # --------------------------------------
+    # Time stamp
+    # --------------------------------------
+    os.chdir(ScriptsDir)
+    end = time.time()
+    print(_format_elapsed(end - start))
+    
+if __name__ == "__main__":
+    # The first command-line argument is passed on as the treatment.
+    AverageReadCounts(sys.argv[1])
@@ -29,7 +29,10 @@ def RunSeqQC():
     os.chdir(DataDir)
     FileNames = [d for d in os.listdir(DataDir)]
     for filename in FileNames:
-        os.system('fastqc -o '+SeqQCDir+' --extract '+filename)
+        if filename[-8:] == 'fastq.gz':
+            os.system('fastqc -o '+SeqQCDir+' --extract '+filename)
+        elif filename[-5:] == 'fastq':
+            os.system('fastqc -o '+SeqQCDir+' '+filename)
     os.chdir(SeqQCDir)    
     os.system('rm *.zip')
     os.chdir(ScriptsDir)