1- import std/ [os, strformat, strutils, parseopt]
1+ import std/ [os, strformat, strutils, parseopt, tables ]
22import ./ abif
33
44# # This module provides a command-line tool for converting ABIF files to FASTQ or FASTA format
@@ -21,6 +21,7 @@ import ./abif
2121# # -v, --verbose Print additional information
2222# # --version Show version information
2323# # --fasta Output in FASTA format instead of FASTQ
24+ # # -s, --split Split ambiguous bases into two sequences
2425# #
2526# # Examples:
2627# #
@@ -36,6 +37,9 @@ import ./abif
3637# #
3738# # # Convert to FASTA format
3839# # abi2fq --fasta input.ab1 output.fasta
40+ # #
41+ # # # Split ambiguous bases into two sequences
42+ # # abi2fq -s input.ab1 output.fastq
3943
4044type
4145 Config * = object
4953 verbose* : bool # # Whether to show verbose output
5054 showVersion* : bool # # Whether to show version information
5155 fasta* : bool # # Whether to output in FASTA format instead of FASTQ
56+ split* : bool # # Whether to split ambiguous bases into two sequences
5257
5358proc printHelp * () =
5459 # # Displays the help message for the abi2fq tool.
@@ -67,6 +72,7 @@ Options:
6772 -v, --verbose Print additional information
6873 --version Show version information
6974 --fasta Output in FASTA format instead of FASTQ
75+ -s, --split Split ambiguous bases into two sequences
7076
7177If output file is not specified, FASTQ will be written to STDOUT.
7278"""
@@ -90,7 +96,8 @@ proc parseCommandLine*(): Config =
9096 noTrim: false ,
9197 verbose: false ,
9298 showVersion: false ,
93- fasta: false
99+ fasta: false ,
100+ split: false
94101 )
95102
96103 var fileArgs: seq [string ] = @ []
@@ -125,6 +132,8 @@ proc parseCommandLine*(): Config =
125132 result .showVersion = true
126133 of " fasta" :
127134 result .fasta = true
135+ of " s" , " split" :
136+ result .split = true
128137 else :
129138 echo " Unknown option: " , key
130139 printHelp ()
@@ -192,29 +201,41 @@ proc trimSequence*(sequence: string, qualities: seq[int],
192201 result .seq = sequence[startPos ..< endPos]
193202 result .qual = qualities[startPos ..< endPos]
194203
195- proc writeFastq * (sequence: string , qualities: seq [int ], name: string , outFile: string = " " , fasta: bool = false ) =
204+ proc writeFastq * (sequence: string , qualities: seq [int ], name: string , outFile: string = " " , fasta: bool = false , splitSeq1: string = " " , splitSeq2: string = " " ) =
196205 # # Writes sequence and quality data to a FASTQ or FASTA file.
197206 # #
198207 # # If outFile is empty, the data is written to stdout.
199208 # # If fasta is true, the output will be in FASTA format instead of FASTQ.
209+ # # If splitSeq1 and splitSeq2 are not empty, writes them as two separate records.
200210 # #
201211 # # Parameters:
202- # # sequence: The DNA sequence to write
212+ # # sequence: The DNA sequence to write (used when not splitting)
203213 # # qualities: Quality scores for each base in the sequence
204214 # # name: The sample name for the header
205215 # # outFile: Path to the output file (empty string for stdout)
206216 # # fasta: Whether to output in FASTA format instead of FASTQ
217+ # # splitSeq1: First sequence when splitting ambiguous bases
218+ # # splitSeq2: Second sequence when splitting ambiguous bases
207219
208220 var content: string
209- if fasta:
210- # Create FASTA format
211- content = & " >{ name} \n { sequence} "
221+
222+ # Create quality string
223+ var qualityString = " "
224+ for qv in qualities:
225+ qualityString.add (chr (qv + 33 ))
226+
227+ if splitSeq1 != " " and splitSeq2 != " " :
228+ # Output split sequences
229+ if fasta:
230+ content = & " >{ name} _1\n { splitSeq1} \n >{ name} _2\n { splitSeq2} "
231+ else :
232+ content = & " @{ name} _1\n { splitSeq1} \n +\n { qualityString} \n @{ name} _2\n { splitSeq2} \n +\n { qualityString} "
212233 else :
213- # Create FASTQ format
214- var qualityString = " "
215- for qv in qualities:
216- qualityString. add ( chr (qv + 33 ))
217- content = & " @{ name} \n { sequence} \n +\n { qualityString} "
234+ # Output single sequence
235+ if fasta:
236+ content = & " > { name } \n { sequence } "
237+ else :
238+ content = & " @{ name} \n { sequence} \n +\n { qualityString} "
218239
219240 if outFile == " " :
220241 # Write to stdout
@@ -223,6 +244,49 @@ proc writeFastq*(sequence: string, qualities: seq[int], name: string, outFile: s
223244 # Write to file
224245 writeFile (outFile, content & " \n " )
225246
247+ proc splitAmbiguousBases * (sequence: string ): tuple [seq1: string , seq2: string ] =
248+ # # Splits ambiguous bases into two sequences.
249+ # #
250+ # # Splits sequence at every ambiguous base that represents exactly 2 alternatives.
251+ # # IUPAC ambiguity codes:
252+ # # - R = A or G
253+ # # - Y = C or T
254+ # # - S = G or C
255+ # # - W = A or T
256+ # # - K = G or T
257+ # # - M = A or C
258+ # #
259+ # # Parameters:
260+ # # sequence: The DNA sequence to split
261+ # #
262+ # # Returns:
263+ # # A tuple containing the two split sequences
264+
265+ # Define mapping of ambiguity codes to their nucleotide options
266+ let ambiguityMap = {
267+ 'R' : @ ['A' , 'G' ],
268+ 'Y' : @ ['C' , 'T' ],
269+ 'S' : @ ['G' , 'C' ],
270+ 'W' : @ ['A' , 'T' ],
271+ 'K' : @ ['G' , 'T' ],
272+ 'M' : @ ['A' , 'C' ]
273+ }.toTable
274+
275+ var seq1 = " "
276+ var seq2 = " "
277+
278+ for base in sequence:
279+ if base in ambiguityMap and ambiguityMap[base].len == 2 :
280+ # Ambiguous base with exactly 2 options
281+ seq1.add (ambiguityMap[base][0 ])
282+ seq2.add (ambiguityMap[base][1 ])
283+ else :
284+ # Non-ambiguous or other ambiguous base
285+ seq1.add (base)
286+ seq2.add (base)
287+
288+ return (seq1, seq2)
289+
226290proc main * () =
227291 # # Main entry point for the abi2fq program.
228292 # #
@@ -236,6 +300,7 @@ proc main*() =
236300 echo & " Window size: { config.windowSize} "
237301 echo & " Quality threshold: { config.qualityThreshold} "
238302 echo & " Trimming: { not config.noTrim} "
303+ echo & " Split ambiguous bases: { config.split} "
239304 if config.fasta:
240305 echo " Output format: FASTA"
241306 else :
@@ -303,7 +368,13 @@ proc main*() =
303368 if endPos < sequence.len:
304369 modifiedSeq.add (sequence[endPos ..< sequence.len].toLowerAscii ())
305370
306- writeFastq (modifiedSeq, qualities, sampleName, config.outFile, config.fasta)
371+ if config.split:
372+ let split = splitAmbiguousBases (modifiedSeq)
373+ if config.verbose:
374+ echo " Splitting ambiguous bases into two sequences"
375+ writeFastq (modifiedSeq, qualities, sampleName, config.outFile, config.fasta, split.seq1, split.seq2)
376+ else :
377+ writeFastq (modifiedSeq, qualities, sampleName, config.outFile, config.fasta)
307378 else :
308379 # Trim low quality ends
309380 let trimmed = trimSequence (sequence, qualities, config.windowSize, config.qualityThreshold)
@@ -313,7 +384,13 @@ proc main*() =
313384 if trimmed.seq .len == 0 :
314385 echo " Warning: Entire sequence was below quality threshold"
315386
316- writeFastq (trimmed.seq , trimmed.qual, sampleName, config.outFile, config.fasta)
387+ if config.split:
388+ let split = splitAmbiguousBases (trimmed.seq )
389+ if config.verbose:
390+ echo " Splitting ambiguous bases into two sequences"
391+ writeFastq (trimmed.seq , trimmed.qual, sampleName, config.outFile, config.fasta, split.seq1, split.seq2)
392+ else :
393+ writeFastq (trimmed.seq , trimmed.qual, sampleName, config.outFile, config.fasta)
317394
318395 trace.close ()
319396 except :
0 commit comments