Merge branch 'solexa'

StuntsPT · StuntsPT · commit af779af0992a · 2016-08-02T17:52:00.000+01:00
diff --git a/4Pipe4.py b/4Pipe4.py
@@ -41,9 +41,16 @@
 The arguments can be given in any order.",
                                  prog="4Pipe4",
                                  formatter_class=RawTextHelpFormatter)
-parser.add_argument("-i", dest="infile", nargs=1, required=True,
-                    help="Provide the full path to your target sff file\n",
-                    metavar="sff_file")
+
+group = parser.add_mutually_exclusive_group(required=True)
+group.add_argument("-i", dest="infile", nargs=1, required=False,
+                    help="Provide the full path to your target input file\n",
+                    metavar="input_file")
+group.add_argument("-p", dest="infile", nargs=2, required=False,
+                    help="Provide the full path to your target input pair \
+                    files. Currentlly only woring for solexa data type.\n",
+                    metavar="input_pair")
+
 parser.add_argument("-o", dest="outfile", nargs=1, required=True,
                     help="Provide the full path to your results directory, \
 plus the name you want to give your results\n",
@@ -63,6 +70,12 @@
 extraction\n\t2 - SeqClean\n\t3 - Mira\n\t4 - DiscoveryTCS\n\t5 - \
 SNP grabber\n\t6 - ORF finder\n\t7 - Blast2go\n\t8 - SSR finder\n\t9 - 7zip \
 the report")
+parser.add_argument("-d", dest="datatype", help="Declare the type of \
+data being used. Currentlly suported are 454 (454) and Illumina (solexa). \
+Default is 454.", required=False, metavar="454/solexa", default="454")
+# parser.add_argument("-p", dest="paired", nargs="?", default=False, type=bool,
+#                     help="Is the data paired end? True/False, default is \
+#                     False.", required=False, metavar="True/False")
 arg = parser.parse_args()
 
 
@@ -80,8 +93,29 @@ def loading(current_state, size, prefix, width):
 
 
 def StartUp():
+    """
+    Make some basic checks regarding user input.
+    """
     basefile = os.path.abspath("".join(arg.outfile))
-    sff = os.path.abspath("".join(arg.infile))
+    input_file = [os.path.abspath("".join(x)) for x in arg.infile]
+
+    # Solexa checks
+    if arg.datatype == "solexa":
+        if "1" in arg.run_list or "2" in arg.run_list:
+            quit("Please skip steps 1 and 2 for illumina data. They are not required.")
+        for inputs in input_file:
+            if inputs.endswith(("fastq", "fastq.gz")) is False:
+                quit("Infile must be in 'fastq' format for illumina data.")
+            if os.path.isfile(basefile + ".fastq"):
+                if basefile + ".fastq" == inputs:
+                    pass
+                else:
+                    quit(basefile + " already exists. Please deal with it \
+                         before proceeding.")
+            elif len(input_file) == 1:
+                os.symlink(inputs, basefile + ".fastq")
+
+
     if arg.configFile is not None:
         rcfile = os.path.abspath("".join(arg.configFile))
     elif os.path.isfile('4Pipe4rc'):
@@ -101,7 +135,7 @@ def StartUp():
     except:
         print("\nERROR: Invalid configuration file\n")
         quit("Please run 4Pipe4.py -h for help with running the pipeline.")
-    return basefile, sff, config
+    return basefile, input_file, config
 
 
 def SysPrep(basefile):
@@ -139,8 +173,10 @@ def RunProgram(cli, requires_output):
 
 
 def SffExtraction(sff, basefile):
-    '''Function for using the sff_extractor module. It will look for an "ideal"
-    clipping value using multiple runs before outputting the final files.'''
+    """
+    Function for using the sff_extractor module. It will look for an "ideal"
+    clipping value using multiple runs before outputting the final files.
+    """
     clip_found = 0
 
     # Sff_extractor parameters:
@@ -160,7 +196,7 @@ def SffExtraction(sff, basefile):
     sff_config["seq_fname"] = basefile + ".fasta"
 
     while clip_found < 2:
-        extra_clip = sff_extractor.extract_reads_from_sff(sff_config, [sff])
+        extra_clip = sff_extractor.extract_reads_from_sff(sff_config, sff[0])
         sff_config["min_leftclip"] += extra_clip
         if extra_clip == 0:
             clip_found += 1
@@ -195,17 +231,31 @@ def SeqClean(basefile):
 
 
 def MiraRun(basefile):
-    '''Assemble the sequences and write the menifest file'''
+    """
+    Write the manifest file and assemble the sequences.
+    """
     basename = os.path.basename(basefile)
     manifest = open(basefile + ".manifest", 'w')
     manifest.write("project = " + basename + "\n")
     manifest.write(config.get('Mira Parameters', 'mirajob') + "\n")
     manifest.write(config.get('Mira Parameters', 'miracommon') + " -GE:not="
                    + config.get('Variables', 'seqcores') + " \\\n")
-    manifest.write(config.get('Mira Parameters', 'mira454') + "\n\n")
+    if arg.datatype == "454":
+        manifest.write(config.get('Mira Parameters', 'mira454') + "\n\n")
+    elif arg.datatype == "solexa":
+        manifest.write(config.get('Mira Parameters', 'mirasolexa') + "\n\n")
     manifest.write(config.get('Mira Parameters', 'mirareadgroup') + "\n")
+    if len(arg.infile) == 2:
+        manifest.write("autopairing\n")
     manifest.write(config.get('Mira Parameters', 'miratech') + "\n")
-    manifest.write("data = " + basename + ".clean.fasta\n")
+    if arg.datatype == "454":
+        manifest.write("data = " + basename + ".clean.fasta\n")
+    elif arg.datatype == "solexa":
+        if len(arg.infile) == 1:
+            manifest.write("data = " + os.path.abspath(arg.infile[0]) + "\n")
+        else:
+            manifest.write("data = " + os.path.abspath(arg.infile[0]) + " " +
+                           os.path.abspath(arg.infile[1]) + "\n")
     manifest.close()
 
     # Run mira
@@ -280,17 +330,24 @@ def ORFliner(basefile):
     print("\nRunning NCBI 'blastx' using the following command:")
     print(' '.join(cli))
     RunProgram(cli, 0)
+
     # Then we write the metrics report:
     print("\nRunning the metrics calculator module...")
     seqclean_log_path = "%s/seqcl_%s.fasta.log" % (os.path.split(basefile)[0],
                                                    miraproject)
-    Metrics.Run_module(seqclean_log_path, basefile + '.fasta',
-                       basefile + '.clean.fasta', basefile + '.fasta.qual',
-                       basefile + '.clean.fasta.qual',
-                       basefile + '_assembly/' + miraproject + '_d_info/'
-                       + miraproject + '_info_assembly.txt', basefile
-                       + '.SNPs.fasta', basefile + '.BestORF.fasta',
-                       basefile + '.Metrics.html')
+    if arg.datatype == "454":
+        Metrics.Run_module(seqclean_log_path, basefile + '.fasta',
+                           basefile + '.clean.fasta', basefile + '.fasta.qual',
+                           basefile + '.clean.fasta.qual',
+                           basefile + '_assembly/' + miraproject + '_d_info/'
+                           + miraproject + '_info_assembly.txt', basefile
+                           + '.SNPs.fasta', basefile + '.BestORF.fasta',
+                           basefile + '.Metrics.html')
+    else:
+        Metrics.Run_as_solexa(basefile + '_assembly/' + miraproject + '_d_info/'
+                             + miraproject + '_info_assembly.txt', basefile
+                             + '.SNPs.fasta', basefile + '.BestORF.fasta',
+                             basefile + '.Metrics.html')
     # Finally we write down our report using the data gathered so far:
     print("\nRunning Reporter module...")
     Reporter.RunModule(basefile + '.BestORF.fasta', basefile + '.SNPs.fasta',
diff --git a/4Pipe4rc b/4Pipe4rc
@@ -41,9 +41,9 @@ min_len = 50
 #Number of CPU cores to be used by seqclean, BLAST and mira:
 seqcores = 10
-#Minumim base coverage to accept as a putative SNP:
+#Minimum base coverage to accept as a putative SNP (20 or 25 are better suited for illumina data):
 mincov = 15
-#Minimum average base quality to accept as a putative SNP:
+#Minimum average base quality to accept as a putative SNP (60 will suffice for illumina data):
 minqual = 70
 #Minimum contig quality to accept SSR:
 min_ssr_qual = 70
 
@@ -54,11 +54,13 @@ min_ssr_qual = 70
 #mirajob = job = est,denovo,accurate
 #miracommon = parameters = COMMON_SETTINGS -AS:nop=5:ugpf=off -CO:mr=on:asir=on -OUT:output_result_caf=off:output_result_maf=on
 #mira454 = 454_SETTINGS -AL:egp=off -CL:cpat=on
+#mirasolexa = SOLEXA_SETTINGS -CO:asir=yes -AL:egp=off
 #mirareadgroup = readgroup = Test data
 #miratech = technology = 454
 
 mirajob =
 miracommon =
 mira454 =
+mirasolexa = 
 mirareadgroup =
 miratech =
diff --git a/Metrics.py b/Metrics.py
@@ -44,7 +44,7 @@ def Read_qual_metrics(qual_file):
             quals += lines
     qual.close()
     qual_avg = "%.2f" % (sum(quals)/len(quals))
-    
+
     return(qual_avg)
 
 
@@ -159,26 +159,28 @@ def Metrics_writer(dataset_info, contig_info, snp_info, metrics_file):
 TABLE,THEAD,TBODY,TFOOT,TR,TH,TD,P { font-family:"Arial"; font-size:small }\
 \n        -->\n        </STYLE>\n    </HEAD>\n<BODY>\n')
     metrics_file.write("<H1>4Pipe4 metrics report:</H1>\n")
-    metrics_file.write("<H2>Dataset metrics:</H2>\n")
-    metrics_file.write("<p>Average read length (before cleaning): "
-                       + str(dataset_info[1][0]) + "</p>\n")
-    metrics_file.write("<p>Maximum read length (before cleaning): "
-                       + str(dataset_info[1][1]) + "</p>\n")
-    metrics_file.write("<p>Median of read length (before cleaning): "
-                       + str(dataset_info[1][2]) + "</p>\n")
-    metrics_file.write("<p>Average base quality (before cleaning): "
-                       + str(dataset_info[3]) + "</p>\n")
-    metrics_file.write("<H3>SeqClean report:</H3>")
-    for lines in dataset_info[0]:
-        metrics_file.write("<p>" + lines + "</p>")
-    metrics_file.write("<p>Average read length (after cleaning): "
-                       + str(dataset_info[2][0]) + "</p>\n")
-    metrics_file.write("<p>Maximum read length (after cleaning): "
-                       + str(dataset_info[2][1]) + "</p>\n")
-    metrics_file.write("<p>Median of read length (after cleaning): "
-                       + str(dataset_info[2][2]) + "</p>\n")
-    metrics_file.write("<p>Average base quality (after cleaning): "
-                       + str(dataset_info[4]) + "</p>\n")
+    # Write these metrics for 454 only.
+    if dataset_info != "solexa":
+        metrics_file.write("<H2>Dataset metrics:</H2>\n")
+        metrics_file.write("<p>Average read length (before cleaning): "
+                           + str(dataset_info[1][0]) + "</p>\n")
+        metrics_file.write("<p>Maximum read length (before cleaning): "
+                           + str(dataset_info[1][1]) + "</p>\n")
+        metrics_file.write("<p>Median of read length (before cleaning): "
+                           + str(dataset_info[1][2]) + "</p>\n")
+        metrics_file.write("<p>Average base quality (before cleaning): "
+                           + str(dataset_info[3]) + "</p>\n")
+        metrics_file.write("<H3>SeqClean report:</H3>")
+        for lines in dataset_info[0]:
+            metrics_file.write("<p>" + lines + "</p>")
+        metrics_file.write("<p>Average read length (after cleaning): "
+                           + str(dataset_info[2][0]) + "</p>\n")
+        metrics_file.write("<p>Maximum read length (after cleaning): "
+                           + str(dataset_info[2][1]) + "</p>\n")
+        metrics_file.write("<p>Median of read length (after cleaning): "
+                           + str(dataset_info[2][2]) + "</p>\n")
+        metrics_file.write("<p>Average base quality (after cleaning): "
+                           + str(dataset_info[4]) + "</p>\n")
 
     metrics_file.write("<H2>Contig metrics:</H2>\n")
     metrics_file.write("<p>Number of reads assembled: "
@@ -234,14 +236,28 @@ def Run_module(seqclean_log_file, original_fasta_file, clean_fasta_file,
                original_fasta_qual_file, clean_fasta_qual_file,
                info_assembly_file, snps_fasta_file, bestorf_fasta_file,
                metrics_file):
-    '''Run the module'''
+    """
+    Run the module
+    """
     dataset_info = Dataset_gather(seqclean_log_file, original_fasta_file,
                                   clean_fasta_file, original_fasta_qual_file,
                                   clean_fasta_qual_file)
     contig_info = Contig_gather(info_assembly_file)
     snp_info = SNP_gather(snps_fasta_file, bestorf_fasta_file)
     Metrics_writer(dataset_info, contig_info, snp_info, metrics_file)
 
+
+def Run_as_solexa(info_assembly_file, snps_fasta_file, bestorf_fasta_file,
+                  metrics_file):
+    """
+    Run the module with solexa data.
+    """
+    contig_info = Contig_gather(info_assembly_file)
+    snp_info = SNP_gather(snps_fasta_file, bestorf_fasta_file)
+    dataset_info = "solexa"
+    Metrics_writer(dataset_info, contig_info, snp_info, metrics_file)
+
+
 if __name__ == "__main__":
     # Usage: python3 Metrics.py (view Run_module() for a list of arguments)
     from sys import argv
diff --git a/README.md b/README.md
@@ -97,7 +97,7 @@ available on any \*nix machine you have access to but don't have root access.)
 4. Generate pre-configured entries for all of the above ready to be copied &
 pasted into 4Pipe4rc.
 
-These scripts should significantlly speed up the instalation process of these
+These scripts should significantly speed up the installation process of these
 external 4Pipe4 programs.
 
 By default these scripts will install all the software to "~/Software", but this
@@ -116,7 +116,7 @@ usage: 4Pipe4 [-h] -i sff_file -o basefile [-c configfile] [-s [RUN_LIST]]
 
 optional arguments:
   -h, --help     show this help message and exit
-  -i sff_file    Provide the full path to your target sff file
+  -i input_file    Provide the full path to your target input file
   -o basefile    Provide the full path to your results directory, plus the name you want to give your results
   -c configfile  Provide the full path to your configuration file. If none is provided, the program will look in the current working directory and  then in ~/.config/4Pipe4rc (in this order) for one. If none is found the  program will stop
   -s [RUN_LIST]  Specify the numbers corresponding to the pipeline steps that will be run. The string after -s must be given inside quotation marks, and numbers can be joined together or separated by any symbol. The numbers are the pipeline steps that should be run. This is an optional argument and it's omission will run all steps by default'. The numbers, from 1 to 9 represent the following steps:
@@ -130,17 +130,28 @@ optional arguments:
                         8 - SSR finder
                         9 - 7zip the report
 
+  -d 454/solexa  Declare the type of data being used. Currently supported are 454 (454) and Illumina (solexa). Default is 454.
+  -p input_pair input_pair  Provide the full paths to your pair of target input files (used instead of -i). Currently only working for the solexa data type.
+
 The idea here is that to resume an analysis that was interrupted for example after the assembling process you should issue -s '4,5,6,7,8,9' or -s '456789'. Note that some steps depend on the output of previous steps, so using some combinations can cause errors. The arguments can be given in any order.
 ```
 
 --------------------------------------------
 
-If you wish to run the entire pipeline, just issue something like
+If you wish to run the entire pipeline on 454 data, just issue something like
 
 ```
 python3 4Pipe4.py -i /path/to/file.sff -o /path/to/results/basefilename
 ```
 
+However, if you wish to run the pipeline with Illumina data, skip steps 1 and 2,
+and add the "-d solexa" switch:
+
+```
+python3 4Pipe4.py -i /path/to/reads.fastq -o /path/to/results/basefilename \
+-d solexa -s 3,4,5,6,7,8,9
+```
+
 Use the -s option to specify only the steps you wish to run from the analysis
 and the -c option to point 4Pipe4 to a specific configuration file.
 
@@ -151,6 +162,8 @@ purposes, as well as documentation on how to do an example run of 4Pipe4.
 
 The configuration file contains information on every option. You should change
 those options to reflect your own system and SNP detection preferences.
+Do not forget that the helper scripts will generate most of the config file
+for you if you wish.
 
 ### CONTACT
 
diff --git a/SNPgrabber.py b/SNPgrabber.py
@@ -27,8 +27,8 @@ def TCStoDict(tcs_file, minqual):
 
     for lines in tcs:
         name = re.match('^\w*', lines).group(0)  # Contig name
-        quals = re.split(' *', re.search('\|.{16}\|', lines).
-                         group(0)[2:-2].strip())
+        quals = lines.split("|")[3].split()
+
         SNP = ''
         for q, b in zip(quals[:-1], ['A', 'C', 'G', 'T']):
             try: