Skip to content

Commit af779af

Browse files
committed
Merge branch 'solexa'
2 parents 70c7b8f + 302463c commit af779af

File tree

5 files changed

+135
-47
lines changed

5 files changed

+135
-47
lines changed

4Pipe4.py

Lines changed: 75 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -41,9 +41,16 @@
4141
The arguments can be given in any order.",
4242
prog="4Pipe4",
4343
formatter_class=RawTextHelpFormatter)
44-
parser.add_argument("-i", dest="infile", nargs=1, required=True,
45-
help="Provide the full path to your target sff file\n",
46-
metavar="sff_file")
44+
45+
group = parser.add_mutually_exclusive_group(required=True)
46+
group.add_argument("-i", dest="infile", nargs=1, required=False,
47+
help="Provide the full path to your target input file\n",
48+
metavar="input_file")
49+
group.add_argument("-p", dest="infile", nargs=2, required=False,
50+
help="Provide the full path to your target input pair \
51+
files. Currentlly only woring for solexa data type.\n",
52+
metavar="input_pair")
53+
4754
parser.add_argument("-o", dest="outfile", nargs=1, required=True,
4855
help="Provide the full path to your results directory, \
4956
plus the name you want to give your results\n",
@@ -63,6 +70,12 @@
6370
extraction\n\t2 - SeqClean\n\t3 - Mira\n\t4 - DiscoveryTCS\n\t5 - \
6471
SNP grabber\n\t6 - ORF finder\n\t7 - Blast2go\n\t8 - SSR finder\n\t9 - 7zip \
6572
the report")
73+
parser.add_argument("-d", dest="datatype", help="Declare the type of \
74+
data being used. Currentlly suported are 454 (454) and Illumina (solexa). \
75+
Default is 454.", required=False, metavar="454/solexa", default="454")
76+
# parser.add_argument("-p", dest="paired", nargs="?", default=False, type=bool,
77+
# help="Is the data paired end? True/False, default is \
78+
# False.", required=False, metavar="True/False")
6679
arg = parser.parse_args()
6780

6881

@@ -80,8 +93,29 @@ def loading(current_state, size, prefix, width):
8093

8194

8295
def StartUp():
96+
"""
97+
Make some basic checks regarding user input.
98+
"""
8399
basefile = os.path.abspath("".join(arg.outfile))
84-
sff = os.path.abspath("".join(arg.infile))
100+
input_file = [os.path.abspath("".join(x)) for x in arg.infile]
101+
102+
# Solexa checks
103+
if arg.datatype == "solexa":
104+
if "1" in arg.run_list or "2" in arg.run_list:
105+
quit("Please skip steps 1 and 2 for illumina data. They are not required.")
106+
for inputs in input_file:
107+
if inputs.endswith(("fastq", "fastq.gz")) is False:
108+
quit("Infile must be in 'fastq' format for illumina data.")
109+
if os.path.isfile(basefile + ".fastq"):
110+
if basefile + ".fastq" == inputs:
111+
pass
112+
else:
113+
quit(basefile + " already exists. Please deal with it \
114+
before proceeding.")
115+
elif len(input_file) == 1:
116+
os.symlink(inputs, arg.outfile + ".fastq")
117+
118+
85119
if arg.configFile is not None:
86120
rcfile = os.path.abspath("".join(arg.configFile))
87121
elif os.path.isfile('4Pipe4rc'):
@@ -101,7 +135,7 @@ def StartUp():
101135
except:
102136
print("\nERROR: Invalid configuration file\n")
103137
quit("Please run 4Pipe4.py -h for help with running the pipeline.")
104-
return basefile, sff, config
138+
return basefile, input_file, config
105139

106140

107141
def SysPrep(basefile):
@@ -139,8 +173,10 @@ def RunProgram(cli, requires_output):
139173

140174

141175
def SffExtraction(sff, basefile):
142-
'''Function for using the sff_extractor module. It will look for an "ideal"
143-
clipping value using multiple runs before outputting the final files.'''
176+
"""
177+
Function for using the sff_extractor module. It will look for an "ideal"
178+
clipping value using multiple runs before outputting the final files.
179+
"""
144180
clip_found = 0
145181

146182
# Sff_extractor parameters:
@@ -160,7 +196,7 @@ def SffExtraction(sff, basefile):
160196
sff_config["seq_fname"] = basefile + ".fasta"
161197

162198
while clip_found < 2:
163-
extra_clip = sff_extractor.extract_reads_from_sff(sff_config, [sff])
199+
extra_clip = sff_extractor.extract_reads_from_sff(sff_config, sff[0])
164200
sff_config["min_leftclip"] += extra_clip
165201
if extra_clip == 0:
166202
clip_found += 1
@@ -195,17 +231,31 @@ def SeqClean(basefile):
195231

196232

197233
def MiraRun(basefile):
198-
'''Assemble the sequences and write the menifest file'''
234+
"""
235+
Write the manifest file and assemble the sequences.
236+
"""
199237
basename = os.path.basename(basefile)
200238
manifest = open(basefile + ".manifest", 'w')
201239
manifest.write("project = " + basename + "\n")
202240
manifest.write(config.get('Mira Parameters', 'mirajob') + "\n")
203241
manifest.write(config.get('Mira Parameters', 'miracommon') + " -GE:not="
204242
+ config.get('Variables', 'seqcores') + " \\\n")
205-
manifest.write(config.get('Mira Parameters', 'mira454') + "\n\n")
243+
if arg.datatype == "454":
244+
manifest.write(config.get('Mira Parameters', 'mira454') + "\n\n")
245+
elif arg.datatype == "solexa":
246+
manifest.write(config.get('Mira Parameters', 'mirasolexa') + "\n\n")
206247
manifest.write(config.get('Mira Parameters', 'mirareadgroup') + "\n")
248+
if len(arg.infile) == 2:
249+
manifest.write("autopairing\n")
207250
manifest.write(config.get('Mira Parameters', 'miratech') + "\n")
208-
manifest.write("data = " + basename + ".clean.fasta\n")
251+
if arg.datatype == "454":
252+
manifest.write("data = " + basename + ".clean.fasta\n")
253+
elif arg.datatype == "solexa":
254+
if len(arg.infile) == 1:
255+
manifest.write("data = " + os.path.abspath(arg.infile[0]) + "\n")
256+
else:
257+
manifest.write("data = " + os.path.abspath(arg.infile[0]) + " " +
258+
os.path.abspath(arg.infile[1]) + "\n")
209259
manifest.close()
210260

211261
# Run mira
@@ -280,17 +330,24 @@ def ORFliner(basefile):
280330
print("\nRunning NCBI 'blastx' using the following command:")
281331
print(' '.join(cli))
282332
RunProgram(cli, 0)
333+
283334
# Then we write the metrics report:
284335
print("\nRunning the metrics calculator module...")
285336
seqclean_log_path = "%s/seqcl_%s.fasta.log" % (os.path.split(basefile)[0],
286337
miraproject)
287-
Metrics.Run_module(seqclean_log_path, basefile + '.fasta',
288-
basefile + '.clean.fasta', basefile + '.fasta.qual',
289-
basefile + '.clean.fasta.qual',
290-
basefile + '_assembly/' + miraproject + '_d_info/'
291-
+ miraproject + '_info_assembly.txt', basefile
292-
+ '.SNPs.fasta', basefile + '.BestORF.fasta',
293-
basefile + '.Metrics.html')
338+
if arg.datatype == "454":
339+
Metrics.Run_module(seqclean_log_path, basefile + '.fasta',
340+
basefile + '.clean.fasta', basefile + '.fasta.qual',
341+
basefile + '.clean.fasta.qual',
342+
basefile + '_assembly/' + miraproject + '_d_info/'
343+
+ miraproject + '_info_assembly.txt', basefile
344+
+ '.SNPs.fasta', basefile + '.BestORF.fasta',
345+
basefile + '.Metrics.html')
346+
else:
347+
Metrics.Run_as_solexa(basefile + '_assembly/' + miraproject + '_d_info/'
348+
+ miraproject + '_info_assembly.txt', basefile
349+
+ '.SNPs.fasta', basefile + '.BestORF.fasta',
350+
basefile + '.Metrics.html')
294351
# Finally we write down our report using the data gathered so far:
295352
print("\nRunning Reporter module...")
296353
Reporter.RunModule(basefile + '.BestORF.fasta', basefile + '.SNPs.fasta',

4Pipe4rc

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,9 +41,9 @@ min_len = 50
4141
#Number of CPU cores to be used by seqclean, BLAST and mira:
4242
seqcores = 10
4343
#Minumim base coverage to accept as a putative SNP:
44-
mincov = 15
44+
mincov = 15 # 20 or 25 are better suited for illumina data
4545
#Minimum average base quality to accept as a putative SNP:
46-
minqual = 70
46+
minqual = 70 # 60 will suffice for illumina data
4747
#Minimum contig quality to accept SSR:
4848
min_ssr_qual = 70
4949

@@ -54,11 +54,13 @@ min_ssr_qual = 70
5454
#mirajob = job = est,denovo,accurate
5555
#miracommon = parameters = COMMON_SETTINGS -AS:nop=5:ugpf=off -CO:mr=on:asir=on -OUT:output_result_caf=off:output_result_maf=on
5656
#mira454 = 454_SETTINGS -AL:egp=off -CL:cpat=on
57+
#mirasolexa = SOLEXA_SETTINGS -CO:asir=yes AL:egp=off
5758
#mirareadgroup = readgroup = Test data
5859
#miratech = technology = 454
5960

6061
mirajob =
6162
miracommon =
6263
mira454 =
64+
mirasolexa =
6365
mirareadgroup =
6466
miratech =

Metrics.py

Lines changed: 38 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ def Read_qual_metrics(qual_file):
4444
quals += lines
4545
qual.close()
4646
qual_avg = "%.2f" % (sum(quals)/len(quals))
47-
47+
4848
return(qual_avg)
4949

5050

@@ -159,26 +159,28 @@ def Metrics_writer(dataset_info, contig_info, snp_info, metrics_file):
159159
TABLE,THEAD,TBODY,TFOOT,TR,TH,TD,P { font-family:"Arial"; font-size:small }\
160160
\n -->\n </STYLE>\n </HEAD>\n<BODY>\n')
161161
metrics_file.write("<H1>4Pipe4 metrics report:</H1>\n")
162-
metrics_file.write("<H2>Dataset metrics:</H2>\n")
163-
metrics_file.write("<p>Average read length (before cleaning): "
164-
+ str(dataset_info[1][0]) + "</p>\n")
165-
metrics_file.write("<p>Maximum read length (before cleaning): "
166-
+ str(dataset_info[1][1]) + "</p>\n")
167-
metrics_file.write("<p>Median of read length (before cleaning): "
168-
+ str(dataset_info[1][2]) + "</p>\n")
169-
metrics_file.write("<p>Average base quality (before cleaning): "
170-
+ str(dataset_info[3]) + "</p>\n")
171-
metrics_file.write("<H3>SeqClean report:</H3>")
172-
for lines in dataset_info[0]:
173-
metrics_file.write("<p>" + lines + "</p>")
174-
metrics_file.write("<p>Average read length (after cleaning): "
175-
+ str(dataset_info[2][0]) + "</p>\n")
176-
metrics_file.write("<p>Maximum read length (after cleaning): "
177-
+ str(dataset_info[2][1]) + "</p>\n")
178-
metrics_file.write("<p>Median of read length (after cleaning): "
179-
+ str(dataset_info[2][2]) + "</p>\n")
180-
metrics_file.write("<p>Average base quality (after cleaning): "
181-
+ str(dataset_info[4]) + "</p>\n")
162+
# Write these metrics for 454 only.
163+
if dataset_info != "solxa":
164+
metrics_file.write("<H2>Dataset metrics:</H2>\n")
165+
metrics_file.write("<p>Average read length (before cleaning): "
166+
+ str(dataset_info[1][0]) + "</p>\n")
167+
metrics_file.write("<p>Maximum read length (before cleaning): "
168+
+ str(dataset_info[1][1]) + "</p>\n")
169+
metrics_file.write("<p>Median of read length (before cleaning): "
170+
+ str(dataset_info[1][2]) + "</p>\n")
171+
metrics_file.write("<p>Average base quality (before cleaning): "
172+
+ str(dataset_info[3]) + "</p>\n")
173+
metrics_file.write("<H3>SeqClean report:</H3>")
174+
for lines in dataset_info[0]:
175+
metrics_file.write("<p>" + lines + "</p>")
176+
metrics_file.write("<p>Average read length (after cleaning): "
177+
+ str(dataset_info[2][0]) + "</p>\n")
178+
metrics_file.write("<p>Maximum read length (after cleaning): "
179+
+ str(dataset_info[2][1]) + "</p>\n")
180+
metrics_file.write("<p>Median of read length (after cleaning): "
181+
+ str(dataset_info[2][2]) + "</p>\n")
182+
metrics_file.write("<p>Average base quality (after cleaning): "
183+
+ str(dataset_info[4]) + "</p>\n")
182184

183185
metrics_file.write("<H2>Contig metrics:</H2>\n")
184186
metrics_file.write("<p>Number of reads assembled: "
@@ -234,14 +236,28 @@ def Run_module(seqclean_log_file, original_fasta_file, clean_fasta_file,
234236
original_fasta_qual_file, clean_fasta_qual_file,
235237
info_assembly_file, snps_fasta_file, bestorf_fasta_file,
236238
metrics_file):
237-
'''Run the module'''
239+
"""
240+
Run the module
241+
"""
238242
dataset_info = Dataset_gather(seqclean_log_file, original_fasta_file,
239243
clean_fasta_file, original_fasta_qual_file,
240244
clean_fasta_qual_file)
241245
contig_info = Contig_gather(info_assembly_file)
242246
snp_info = SNP_gather(snps_fasta_file, bestorf_fasta_file)
243247
Metrics_writer(dataset_info, contig_info, snp_info, metrics_file)
244248

249+
250+
def Run_as_solexa(info_assembly_file, snps_fasta_file, bestorf_fasta_file,
251+
metrics_file):
252+
"""
253+
Run the module with solexa data.
254+
"""
255+
contig_info = Contig_gather(info_assembly_file)
256+
snp_info = SNP_gather(snps_fasta_file, bestorf_fasta_file)
257+
dataset_info = "solexa"
258+
Metrics_writer(dataset_info, contig_info, snp_info, metrics_file)
259+
260+
245261
if __name__ == "__main__":
246262
# Usage: python3 Metrics.py (view Run_module() for a list of arguments)
247263
from sys import argv

README.md

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ available on any \*nix machine you have access to but don't have root access.)
9797
4. Generate pre-configured entries for all of the above ready to be copied &
9898
pasted into 4Pipe4rc.
9999

100-
These scripts should significantlly speed up the instalation process of these
100+
These scripts should significantly speed up the installation process of these
101101
external 4Pipe4 programs.
102102

103103
By default these scripts will install all the software to "~/Software", but this
@@ -116,7 +116,7 @@ usage: 4Pipe4 [-h] -i sff_file -o basefile [-c configfile] [-s [RUN_LIST]]
116116
117117
optional arguments:
118118
-h, --help show this help message and exit
119-
-i sff_file Provide the full path to your target sff file
119+
-i input_file Provide the full path to your target input file
120120
-o basefile Provide the full path to your results directory, plus the name you want to give your results
121121
-c configfile Provide the full path to your configuration file. If none is provided, the program will look in the current working directory and then in ~/.config/4Pipe4rc (in this order) for one. If none is found the program will stop
122122
-s [RUN_LIST] Specify the numbers corresponding to the pipeline steps that will be run. The string after -s must be given inside quotation marks, and numbers can be joined together or separated by any symbol. The numbers are the pipeline steps that should be run. This is an optional argument and it's omission will run all steps by default'. The numbers, from 1 to 9 represent the following steps:
@@ -130,17 +130,28 @@ optional arguments:
130130
8 - SSR finder
131131
9 - 7zip the report
132132
133+
-d 454/solexa Declare the type of data being used. Currentlly suported are 454 (454) and Illumina (solexa). Default is 454.
134+
-p [True/False] Is the data paired end? True/False, default is False.
135+
133136
The idea here is that to resume an analysis that was interrupted for example after the assembling process you should issue -s '4,5,6,7,8,9' or -s '456789'. Note that some steps depend on the output of previous steps, so using some combinations can cause errors. The arguments can be given in any order.
134137
```
135138

136139
--------------------------------------------
137140

138-
If you wish to run the entire pipeline, just issue something like
141+
If you wish to run the entire pipeline on 454 data, just issue something like
139142

140143
```
141144
python3 4Pipe4.py -i /path/to/file.sff -o /path/to/results/basefilename
142145
```
143146

147+
However, if you wish to run the pipeline with Illumina data, skip steps 1 and 2,
148+
and add the "-d solexa" switch:
149+
150+
```
151+
python3 4Pipe4.py -i /path/to/reads.fastq -o /path/to/results/basefilename\
152+
-d solexa -s 3,4,5,6,7,8,9
153+
```
154+
144155
Use the -s option to specify only the steps you wish to run from the analysis
145156
and the -c option to point 4Pipe4 to a specific configuration file.
146157

@@ -151,6 +162,8 @@ purposes, as well as documentation on how to do an example run of 4Pipe4.
151162

152163
The configuration file contains information on every option. You should change
153164
those options to reflect your own system and SNP detection preferences.
165+
Do not forget that the helper scripts will generate most of the config file
166+
for you if you wish.
154167

155168
### CONTACT
156169

SNPgrabber.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,8 @@ def TCStoDict(tcs_file, minqual):
2727

2828
for lines in tcs:
2929
name = re.match('^\w*', lines).group(0) # Contig name
30-
quals = re.split(' *', re.search('\|.{16}\|', lines).
31-
group(0)[2:-2].strip())
30+
quals = lines.split("|")[3].split()
31+
3232
SNP = ''
3333
for q, b in zip(quals[:-1], ['A', 'C', 'G', 'T']):
3434
try:

0 commit comments

Comments
 (0)