v2.7.6

PhilippSpahn · PhilippSpahn · commit 43166208e797 · 2017-09-15T11:08:05.000-07:00
diff --git a/Scripts/AlignReads.py b/Scripts/AlignReads.py
@@ -83,9 +83,6 @@ def MapAndCount(sample):
     AlnStemDir = config['AlignDir']
     AlnDir = AlnStemDir+sample+'/'
     OutputDir = config['AlnQCDir']+sample   
-    seq_5_end = config['seq_5_end']
-    CutErrorTol = config['CutErrorTol']
-    R_min = config['R_min']
     minN = config['Cutoff']
     LibFilename = config['LibFilename']
     LibFormat = LibFilename[-3:]
@@ -160,6 +157,8 @@ def MapAndCount(sample):
     # ------------------------------------------ 
     start = time.time()
     print('Analyzing alignment ...') 
+    print('Applying matching threshold ...')
+    print('Applying ambiguity threshold ...')    
     # CLASSIFY ALIGNMENTS 
     os.chdir(AlnDir)
     bw2outputFilename = ReadsFilename0 + '_bw2output.sam'
@@ -211,16 +210,19 @@ def MapAndCount(sample):
             NFail += 1
             AlnStatus.append('Fail')
     bw2sam.close();          
-
-    # ------------------------------------------
-    # Text output and plots
-    # ------------------------------------------ 
-    print('Writing alignment logfile ...')
     NReads = NTol + NAmb + NUnique + NFail
     FracUnique = round(NUnique/NReads*1000)/10    
     FracTol = round(NTol/NReads*1000)/10
     FracAmb = round(NAmb/NReads*1000)/10
-    FracFail = round(NFail/NReads*1000)/10    
+    FracFail = round(NFail/NReads*1000)/10         
+    print('*** Successfully mapped reads: '\
+        +str(NUnique+NTol)+' ('+str(FracUnique+FracTol)+'%) ***')
+    
+    
+    # ------------------------------------------
+    # Text output and plots
+    # ------------------------------------------ 
+    print('Writing alignment logfile ...')  
     if aln_time < 60:
         time_elapsed = aln_time
         time_unit = ' [secs]'
@@ -356,9 +358,6 @@ def MapAndCount(sample):
     for k in range(L):
         GuideCounts.write(str(sgIDs[k]) + '\t'+ str(geneIDs[k]) + '\t' + str(ReadsPerGuide[k]) + '\n')
     GuideCounts.close()
-    # No-mapping warning
-    if sum(ReadsPerGuide) == 0:
-        print('!! ERROR: Zero total read counts! Check library file and index !!')
     # Read counts per gene in library       
     print('Counting reads per gene ...')   
     global GeneList
@@ -373,6 +372,9 @@ def MapAndCount(sample):
     GeneCounts.close()        
     end = time.time()
     print('Read counting completed.')
+    # No-mapping warning
+    if sum(ReadsPerGuide) == 0:
+        print('### ERROR: Zero read counts! Check library and alignment ###')    
     # Time stamp
     sec_elapsed = end-start
     if sec_elapsed < 60:
diff --git a/Scripts/AnalyzeControl.py b/Scripts/AnalyzeControl.py
@@ -58,7 +58,7 @@ def EstimateControlCounts():
                                     'gene': genes},
                             columns = ['sgID','gene'])        
     if len(ControlSamples) == 0:
-        print('ERROR: No control sample directories found!')
+        print('### ERROR: No control sample directories found! ###')
     else:
         os.chdir(AlnQCDir)
         for controlsample in ControlSamples:
diff --git a/Scripts/Bowtie2.py b/Scripts/Bowtie2.py
@@ -14,7 +14,7 @@
 
 def BuildIndex(LibFastA,IndexDir,bw2Dir):
     os.chdir(IndexDir)
-    bw2_cmdline = bw2Dir+'bowtie2-build -f library.fasta Library'
+    bw2_cmdline = bw2Dir+'bowtie2-build -q -f library.fasta Library'  # input name is hard-coded (LibFastA is not used here); NOTE(review): -q presumably silences bowtie2-build output - confirm against the bowtie2 manual
     os.system(bw2_cmdline)
 
 def RunBowtie2(ReadsFilename0,TempDataDir,AlnDir,bw2Dir,IndexDir,L_bw,N_bw,i_bw):   
diff --git a/Scripts/CheckCharacters.py b/Scripts/CheckCharacters.py
@@ -0,0 +1,113 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Oct 10 10:22:56 2016
+
+@author: philipp
+"""
+# Library sanity check
+# =======================================================================
+# Imports
+import yaml
+import sys
+import os
+import pandas
+
+def RunSanityCheck():  # Replaces special characters by '_' in the library file, the read filenames and the treatment names
+    # ------------------------------------------------
+    # Get parameters
+    # ------------------------------------------------
+    configFile = open('configuration.yaml','r')  # relative path: resolved against the current working directory
+    config = yaml.load(configFile)  # NOTE(review): no explicit Loader is passed; consider yaml.safe_load - confirm against the PyYAML docs
+    configFile.close()
+    LibDir = config['LibDir']
+    LibFilename = config['LibFilename']
+    LibFormat = LibFilename[-3:]  # last three characters of the filename decide the separator
+    if LibFormat == 'tsv':
+        libsep = '\t'
+    elif LibFormat == 'csv':
+        libsep = ','  # NOTE(review): for any other extension libsep is never assigned, so its first use below raises UnboundLocalError
+    DataDir = config['DataDir']
+    WorkingDir = config['WorkingDir']
+      
+    # --------------------------------------------------------------------
+    # Replace special characters in library (...these cause problems in PlotCount.py)
+    # --------------------------------------------------------------------   
+    os.chdir(LibDir)
+    LibCols = ['gene','ID','seq']
+    LibFile = pandas.read_table(LibFilename, sep = libsep, skiprows = 1, names = LibCols)  # first row is skipped; the columns are named gene/ID/seq
+    GeneNames = list(LibFile['gene'])
+    ID = list(LibFile['ID'])
+    seq = list(LibFile['seq']) 
+    GeneNames0 = []
+    ID0 = []    
+    BadCharacters = [' ','>','<',';',':',',','|','/','\\','(',')','[',']',\
+        '$','%','*','?','{','}','=','+','@']  # every character in this list is replaced by '_'
+    for gene in GeneNames:
+        for bad_char in BadCharacters:
+            gene = gene.replace(bad_char,'_')  # NOTE(review): assumes every gene entry is a str (no str() conversion here, unlike the treatment names below)
+        GeneNames0.append(gene)
+    for sgRNA in ID:
+        for bad_char in BadCharacters:
+            sgRNA = sgRNA.replace(bad_char,'_')       
+        ID0.append(sgRNA)    
+    if GeneNames != GeneNames0 or ID != ID0:  # rewrite the library only when at least one name changed
+            LibFile0 = pandas.DataFrame(data = {'gene': [gene for gene in GeneNames0],
+                                     'ID': [sgRNA for sgRNA in ID0],
+                                     'seq': [s for s in seq]},
+                            columns = ['gene','ID','seq'])
+            LibFile0.to_csv(LibFilename, sep = libsep, index = False)  # overwrites the original library file in LibDir
+            print("WARNING: Special characters in library file have been replaced by '_' ")
+
+    # --------------------------------------------------------------------
+    # Load Data Sheet
+    # -------------------------------------------------------------------- 
+    os.chdir(WorkingDir)
+    DataSheet = pandas.read_excel('DataSheet.xlsx')
+    Filenames = list(DataSheet['FILENAME'])
+    TreatmentList = list(DataSheet['TREATMENT'])
+    F = len(Filenames)
+    BadCharFound = False    # set to True when a filename or treatment name had to be changed
+    
+    # --------------------------------------------------------------------
+    # Replace special characters in filenames
+    # --------------------------------------------------------------------      
+    os.chdir(DataDir)    
+    BadCharacters = [' ','>','<',';',':',',','|','/','\\','(',')','[',']',\
+        '$','%','*','?','{','}','=','+','@']  # same list as used for the library above
+    for j in range(F):
+        Filename = Filenames[j]
+        Filename0 = Filename
+        for bad_char in BadCharacters:
+            Filename0 = Filename0.replace(bad_char,'_')
+        if Filename0 != Filename:
+            BadCharFound = True
+            os.system('mv '+"'"+Filename+"'"+' '+Filename0)  # renames the file in DataDir through the shell; NOTE(review): a single quote in Filename is not in BadCharacters and would break this quoting
+            DataSheet['FILENAME'][j] = Filename0            # NOTE(review): chained indexing assignment - pandas may warn or write to a copy; verify the sheet is actually updated
+
+    # --------------------------------------------------------------------
+    # Replace special characters in treatment names
+    # --------------------------------------------------------------------             
+    TreatmentList0 = TreatmentList  # alias only; rebound to a new list on the first pass of the loop below
+    for bad_char in BadCharacters:
+        TreatmentList0 = [str(treatment).replace(bad_char,'_') for treatment in TreatmentList0]  # str() also converts non-string cells
+    if TreatmentList0 != TreatmentList:  # NOTE(review): also true when str() alone changed a non-string entry, even without any special character
+        BadCharFound = True
+        DataSheet['TREATMENT'] = TreatmentList0        
+        
+    # --------------------------------------------------------------------
+    # Update Data Sheet
+    # --------------------------------------------------------------------            
+    if BadCharFound:
+        os.chdir(WorkingDir)
+        DataSheet.to_excel('DataSheet.xlsx',columns=['FILENAME','TREATMENT'])  # overwrites the sheet; only these two columns are written back
+        print("WARNING: Special characters in sample names replaced by '_'")
+    else:
+        print('No special characters found.')
+    
+    
+
+
+   
+if __name__ == "__main__":
+    RunSanityCheck()
diff --git a/Scripts/FindHits.py b/Scripts/FindHits.py
@@ -46,7 +46,7 @@ def PrepareHitList(sample):
     ListDir = config['HitDir']
     CtrlCounts_Filename = 'Control_GuideCounts_0.tsv'
     ScreenType = config['ScreenType']
-    alpha = config['alpha']    
+    alpha = config['alpha_s']
     padj = config['padj']
     SheetFormat = config['HitListFormat']
     delta = config['delta']
@@ -128,7 +128,7 @@ def PrepareHitList(sample):
     # -----------------------------------------------------------                  
     else:                           # error in scree type
     # -----------------------------------------------------------   
-        print('ERROR: Check spelling of ScreenType in configuration file!')
+        print('### ERROR: Check spelling of ScreenType in configuration file! ###')
 
     # -----------------------------------------------
     # p-value Correction and Plots
@@ -162,10 +162,10 @@ def PrepareHitList(sample):
                                      'control stdev': [numpy.sqrt(sigma2[k]) for k in range(L)],
                                      'fold change': [fc[k] for k in range(L)],   
                                      'p-value': [NBpval[k] for k in range(L)],
-                                     'FDR': [NBpval_0[k] for k in range(L)],                                                 
+                                     'p-value (adj.)': [NBpval_0[k] for k in range(L)],                                                 
                                      'significant': [str(significant[k]) for k in range(L)]},
                             columns = ['sgRNA','gene','counts','control mean',\
-                            'control stdev','fold change','p-value','FDR','significant'])
+                            'control stdev','fold change','p-value','p-value (adj.)','significant'])
     if ScreenType == 'enrichment': 
         Results_df_0 = Results_df.sort_values(['significant','p-value','fold change','sgRNA'],ascending=[0,1,0,1])
     elif ScreenType == 'depletion': 
diff --git a/Scripts/LoadDataSheet.py b/Scripts/LoadDataSheet.py
@@ -24,8 +24,7 @@ def LoadExcelDataSheet():
     os.chdir(WorkingDir)
     DataSheet = pandas.read_excel('DataSheet.xlsx')
     FileNames = list(DataSheet['FILENAME'].values)
-    TreatmentList = list(DataSheet['TREATMENT'].values)
-    TreatmentList = [str(treatment).replace(' ','_') for treatment in TreatmentList]    # replace spaces   
+    TreatmentList = list(DataSheet['TREATMENT'])
     Treatments = list(set(TreatmentList))
     if 'Control' in Treatments:
         N = len(FileNames)
@@ -42,7 +41,7 @@ def LoadExcelDataSheet():
         DataSheet.to_excel('DataSheet.xlsx',columns=['FILENAME','TREATMENT','SAMPLE NAME'])
         os.chdir(ScriptsDir)
     else:
-        print('ERROR: No control treatment defined!')
+        print('### ERROR: No control treatment defined! ###')
     
     
 if __name__ == "__main__":
diff --git a/Scripts/NormalizeReadCounts.py b/Scripts/NormalizeReadCounts.py
@@ -214,7 +214,7 @@ def Normalization():
                 header=False,index=False)                        
             os.chdir(AlnQCDir)
     else:
-        print('ERROR: Check spelling of Normalization parameter in configuration file!')
+        print('### ERROR: Check spelling of Normalization parameter in configuration file! ###')
     
     # --------------------------------------
     # Time stamp
diff --git a/Scripts/PinAPL.py b/Scripts/PinAPL.py
@@ -56,8 +56,12 @@
 os.system('python -u PrintStatus.py Header blank 2>&1 | tee PinAPL-Py.log')
 start = time.time()
 
-# Library sanity check
+# Character sanity check
+StatMsg = 'Running character sanity check ...'
+os.system('python -u PrintStatus.py SubHeader "'+StatMsg+'" 2>&1 | tee -a PinAPL-Py.log')
 os.system('python -u '+SanityScript+'.py 2>&1 | tee -a PinAPL-Py.log')
+DoneMsg = 'Character sanity check completed.'
+os.system('python -u PrintStatus.py Done "'+DoneMsg+'" 2>&1 | tee -a PinAPL-Py.log')
 
 # Generate index if not present
 if not os.path.exists(IndexDir):
diff --git a/Scripts/PlotCounts.py b/Scripts/PlotCounts.py
@@ -42,7 +42,7 @@ def GOI_Scatterplot(sample,GOI='None',Annot='none',NonT='none',Transp='none'):
     PlotDir = config['ScatterDir']
     HiLiteDir = config['HiLiteDir']
     ScreenType = config['ScreenType']    
-    alpha = config['alpha']
+    alpha = config['alpha_s']
     delta = config['delta']
     NonTPrefix = config['NonTargetPrefix']
     res = config['dpi']
@@ -117,7 +117,7 @@ def GOI_Scatterplot(sample,GOI='None',Annot='none',NonT='none',Transp='none'):
     os.chdir(PlotDir)   
     fig,ax = plt.subplots(figsize=(4,4.25))
     plt.scatter(control_rest,sample_rest,s=dotsize,facecolor='black',lw=0,alpha=TransparencyLevel)
-    plt.scatter(control_sig,sample_sig,s=dotsize,facecolor='green',lw=0,alpha=tpcy,label='FDR<'+str(alpha))
+    plt.scatter(control_sig,sample_sig,s=dotsize,facecolor='green',lw=0,alpha=tpcy,label='p < '+str(alpha))
     if len(K_nonT)>0 and ShowNonTargets:
         plt.scatter(control_nonT,sample_nonT,s=dotsize,facecolor='orange',lw=0,alpha=0.35,\
             label='Non Targeting')
@@ -167,7 +167,7 @@ def GOI_Scatterplot(sample,GOI='None',Annot='none',NonT='none',Transp='none'):
         print('sgID\t\tCounts\tControl\tSignificant')    
         print('-----------------------------------------------')       
         if not K_goi:
-            print('ERROR: Gene name not found!')
+            print('### ERROR: Gene name not found! ###')
         else:            
             for k in K_goi:        
                 println = str(sgIDs[k])+'\t'+str(int(sample_counts[k]))+'\t'+ \
diff --git a/Scripts/PlotReplicates.py b/Scripts/PlotReplicates.py
@@ -42,7 +42,6 @@ def Repl_Scatterplot(Repl1,Repl2,GOI='None',Annot='none',NonT='none',Transp='non
     AlnQCDir = config['AlnQCDir']
     PlotDir = config['CorrelDir']
     HiLiteDir2 = config['HiLiteDir2']
-    alpha = config['alpha']
     delta = config['delta']
     NonTPrefix = config['NonTargetPrefix']
     res = config['dpi']
diff --git a/Scripts/PrintStatus.py b/Scripts/PrintStatus.py
@@ -12,8 +12,8 @@
 
 def PrintStatus_Header():
     print('**************************************************')
-    print('Launching PinAPL-Py v2.7.5..')
-    print('P. Spahn et al., UC San Diego (08/2017)')
+    print('Launching PinAPL-Py v2.7.6..')  # version is hard-coded in this banner string; it has to be edited by hand for each release
+    print('P. Spahn et al., UC San Diego (09/2017)')  # date stamp in the banner is hard-coded as well
     print('**************************************************')
     
 def PrintStatus_SubHeader(msg):
diff --git a/Scripts/RankGenes.py b/Scripts/RankGenes.py
@@ -133,7 +133,7 @@ def GeneRankingAnalysis(sample):
     ListDir = config['HitDir']
     EffDir = config['EffDir']
     GeneDir = config['GeneDir']
-    alpha = config['alpha']            
+    alpha = config['alpha_g']            
     padj = config['padj']
     screentype = config['ScreenType']    
     num_cores = multiprocessing.cpu_count()
@@ -267,12 +267,12 @@ def GeneRankingAnalysis(sample):
                 GOI = geneList[g]
                 pval = ecdf(metric[g])
                 metric_pval.append(pval)
-            # Determine critical p value (FDR correction)
+            # Determine critical p value (p-value correction)
             multTest = multipletests(metric_pval,alpha,padj)
             metric_sig = multTest[0]
             metric_pval0 = multTest[1]
         else: # no control replicates
-            print('ERROR: Cannot compute aRRA scores without significant sgRNAs!')
+            print('### ERROR: Cannot compute aRRA scores without significant sgRNAs! ###')
             SortFlag = True
             metric = [-1 for k in range(G)]
             metric_pval = [-1 for k in range(G)]
@@ -363,13 +363,13 @@ def GeneRankingAnalysis(sample):
     print('Writing results dataframe ...')
     Results_df = pandas.DataFrame(data = {'gene': [geneList[g] for g in range(G)],
                                     GeneMetric: [metric[g] for g in range(G)],
-                                     GeneMetric+' p_value': [metric_pval[g] for g in range(G)],
-                                    GeneMetric+' FDR': [metric_pval0[g] for g in range(G)],
+                                     'p_value': [metric_pval[g] for g in range(G)],
+                                    'p_value (adj.)': [metric_pval0[g] for g in range(G)],
                                      'significant': [str(metric_sig[g]) for g in range(G)],
                                      '# sgRNAs': [nGuides[g] for g in range(G)],                
                                      '# signif. sgRNAs': [sigGuides[g] for g in range(G)],
                                     'avg. logFC': [AvgLogFCs[g] for g in range(G)]},
-                            columns = ['gene',GeneMetric,GeneMetric+' p_value',GeneMetric+' FDR',\
+                            columns = ['gene',GeneMetric,'p_value','p_value (adj.)',\
                             'significant','# sgRNAs','# signif. sgRNAs','avg. logFC'])
     Results_df_0 = Results_df.sort_values(['significant',GeneMetric],ascending=[False,SortFlag])
     GeneListFilename = filename[0:-14]+'_'+GeneMetric+'_'+'P'+str(Np)+'_GeneList.tsv'
diff --git a/Scripts/TrimReads.py b/Scripts/TrimReads.py
@@ -25,7 +25,8 @@ def RunCutadapt():
     TrimLogDir = config['TrimLogDir']
     seq_5_end = config['seq_5_end']
     CutErrorTol = config['CutErrorTol']
-    R_min = config['R_min']
+    sgLength = config['sgRNALength']
+    R_min = sgLength
     ScriptsDir = config['ScriptsDir']
     RunInBack = 'RunInBackground.sh'
     
@@ -44,13 +45,14 @@ def RunCutadapt():
         LogFilename = 'cutadapt_'+ReadsFilename+'.txt'
         command = CutAdaptDir+'cutadapt -g '+seq_5_end\
                                 +' '+DataDir+ReadsFilename+' -o '+ReadsFilename0\
-                                +' -e '+str(CutErrorTol)+' -m '+str(R_min)+' -l 20'\
+                                +' -e '+str(CutErrorTol)+' -m '+str(R_min)+' -l '+str(sgLength)\
                                 +' 2>&1 > '+LogFilename
         subprocess.call(ScriptsDir+RunInBack+' "'+command+'" '+ReadsFilename+' cutadapt_status.log &',shell=True,\
             stdin=None, stdout=None, stderr=None, close_fds=True)
         filesNotDone.append(ReadsFilename+'_cutadapt_status.log')
         print('Loading '+ReadsFilename)
     print('Removing adapters ...')
+    print('Extracting '+str(sgLength)+' bp sgRNA sequences ...')
     # Check for completion
     while len(filesNotDone) > 0:
         time.sleep(.1)
diff --git a/Scripts/pvalPlots.py b/Scripts/pvalPlots.py
diff --git a/configuration.yaml b/configuration.yaml