Skip to content

Commit 7e3f7d0

Browse files
committed
Update module
1 parent 98134bf commit 7e3f7d0

File tree

3 files changed

+119
-25
lines changed

3 files changed

+119
-25
lines changed

src/abi2fq.nim

Lines changed: 91 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import std/[os, strformat, strutils, parseopt]
1+
import std/[os, strformat, strutils, parseopt, tables]
22
import ./abif
33

44
## This module provides a command-line tool for converting ABIF files to FASTQ or FASTA format
@@ -21,6 +21,7 @@ import ./abif
2121
## -v, --verbose Print additional information
2222
## --version Show version information
2323
## --fasta Output in FASTA format instead of FASTQ
24+
## -s, --split Split ambiguous bases into two sequences
2425
##
2526
## Examples:
2627
##
@@ -36,6 +37,9 @@ import ./abif
3637
##
3738
## # Convert to FASTA format
3839
## abi2fq --fasta input.ab1 output.fasta
40+
##
41+
## # Split ambiguous bases into two sequences
42+
## abi2fq -s input.ab1 output.fastq
3943

4044
type
4145
Config* = object
@@ -49,6 +53,7 @@ type
4953
verbose*: bool ## Whether to show verbose output
5054
showVersion*: bool ## Whether to show version information
5155
fasta*: bool ## Whether to output in FASTA format instead of FASTQ
56+
split*: bool ## Whether to split ambiguous bases into two sequences
5257

5358
proc printHelp*() =
5459
## Displays the help message for the abi2fq tool.
@@ -67,6 +72,7 @@ Options:
6772
-v, --verbose Print additional information
6873
--version Show version information
6974
--fasta Output in FASTA format instead of FASTQ
75+
-s, --split Split ambiguous bases into two sequences
7076
7177
If output file is not specified, FASTQ will be written to STDOUT.
7278
"""
@@ -90,7 +96,8 @@ proc parseCommandLine*(): Config =
9096
noTrim: false,
9197
verbose: false,
9298
showVersion: false,
93-
fasta: false
99+
fasta: false,
100+
split: false
94101
)
95102

96103
var fileArgs: seq[string] = @[]
@@ -125,6 +132,8 @@ proc parseCommandLine*(): Config =
125132
result.showVersion = true
126133
of "fasta":
127134
result.fasta = true
135+
of "s", "split":
136+
result.split = true
128137
else:
129138
echo "Unknown option: ", key
130139
printHelp()
@@ -192,29 +201,41 @@ proc trimSequence*(sequence: string, qualities: seq[int],
192201
result.seq = sequence[startPos ..< endPos]
193202
result.qual = qualities[startPos ..< endPos]
194203

195-
proc writeFastq*(sequence: string, qualities: seq[int], name: string, outFile: string = "", fasta: bool = false) =
204+
proc writeFastq*(sequence: string, qualities: seq[int], name: string, outFile: string = "", fasta: bool = false, splitSeq1: string = "", splitSeq2: string = "") =
196205
## Writes sequence and quality data to a FASTQ or FASTA file.
197206
##
198207
## If outFile is empty, the data is written to stdout.
199208
## If fasta is true, the output will be in FASTA format instead of FASTQ.
209+
## If splitSeq1 and splitSeq2 are not empty, writes them as two separate records.
200210
##
201211
## Parameters:
202-
## sequence: The DNA sequence to write
212+
## sequence: The DNA sequence to write (used when not splitting)
203213
## qualities: Quality scores for each base in the sequence
204214
## name: The sample name for the header
205215
## outFile: Path to the output file (empty string for stdout)
206216
## fasta: Whether to output in FASTA format instead of FASTQ
217+
## splitSeq1: First sequence when splitting ambiguous bases
218+
## splitSeq2: Second sequence when splitting ambiguous bases
207219

208220
var content: string
209-
if fasta:
210-
# Create FASTA format
211-
content = &">{name}\n{sequence}"
221+
222+
# Create quality string
223+
var qualityString = ""
224+
for qv in qualities:
225+
qualityString.add(chr(qv + 33))
226+
227+
if splitSeq1 != "" and splitSeq2 != "":
228+
# Output split sequences
229+
if fasta:
230+
content = &">{name}_1\n{splitSeq1}\n>{name}_2\n{splitSeq2}"
231+
else:
232+
content = &"@{name}_1\n{splitSeq1}\n+\n{qualityString}\n@{name}_2\n{splitSeq2}\n+\n{qualityString}"
212233
else:
213-
# Create FASTQ format
214-
var qualityString = ""
215-
for qv in qualities:
216-
qualityString.add(chr(qv + 33))
217-
content = &"@{name}\n{sequence}\n+\n{qualityString}"
234+
# Output single sequence
235+
if fasta:
236+
content = &">{name}\n{sequence}"
237+
else:
238+
content = &"@{name}\n{sequence}\n+\n{qualityString}"
218239

219240
if outFile == "":
220241
# Write to stdout
@@ -223,6 +244,49 @@ proc writeFastq*(sequence: string, qualities: seq[int], name: string, outFile: s
223244
# Write to file
224245
writeFile(outFile, content & "\n")
225246

247+
func splitAmbiguousBases*(sequence: string): tuple[seq1: string, seq2: string] =
  ## Resolves two-way IUPAC ambiguity codes into two alternative sequences.
  ##
  ## Every base that stands for exactly two nucleotides is replaced by its
  ## first alternative in `seq1` and by its second alternative in `seq2`:
  ## - R = A or G
  ## - Y = C or T
  ## - S = G or C
  ## - W = A or T
  ## - K = G or T
  ## - M = A or C
  ##
  ## All other characters (plain bases, N, three-way codes, gaps, ...) are
  ## copied unchanged to both outputs, so both results always have the same
  ## length as the input.
  ##
  ## Matching is case-insensitive and the case of the input base is kept:
  ## a lowercase `r` yields `a` / `g`. This matters for sequences whose
  ## low-quality regions have been lowercased before splitting.
  ##
  ## Parameters:
  ##   sequence: The DNA sequence to split
  ##
  ## Returns:
  ##   A tuple containing the two split sequences
  result.seq1 = newStringOfCap(sequence.len)
  result.seq2 = newStringOfCap(sequence.len)

  for base in sequence:
    # Look the code up in uppercase so that lowercase codes are recognised.
    let pair: (char, char) =
      case toUpperAscii(base)
      of 'R': ('A', 'G')
      of 'Y': ('C', 'T')
      of 'S': ('G', 'C')
      of 'W': ('A', 'T')
      of 'K': ('G', 'T')
      of 'M': ('A', 'C')
      else: (base, base) # not a two-way code: pass through untouched

    if isLowerAscii(base):
      # Keep the lowercase marking of the original base in both outputs.
      result.seq1.add(toLowerAscii(pair[0]))
      result.seq2.add(toLowerAscii(pair[1]))
    else:
      result.seq1.add(pair[0])
      result.seq2.add(pair[1])
289+
226290
proc main*() =
227291
## Main entry point for the abi2fq program.
228292
##
@@ -236,6 +300,7 @@ proc main*() =
236300
echo &"Window size: {config.windowSize}"
237301
echo &"Quality threshold: {config.qualityThreshold}"
238302
echo &"Trimming: {not config.noTrim}"
303+
echo &"Split ambiguous bases: {config.split}"
239304
if config.fasta:
240305
echo "Output format: FASTA"
241306
else:
@@ -303,7 +368,13 @@ proc main*() =
303368
if endPos < sequence.len:
304369
modifiedSeq.add(sequence[endPos ..< sequence.len].toLowerAscii())
305370

306-
writeFastq(modifiedSeq, qualities, sampleName, config.outFile, config.fasta)
371+
if config.split:
372+
let split = splitAmbiguousBases(modifiedSeq)
373+
if config.verbose:
374+
echo "Splitting ambiguous bases into two sequences"
375+
writeFastq(modifiedSeq, qualities, sampleName, config.outFile, config.fasta, split.seq1, split.seq2)
376+
else:
377+
writeFastq(modifiedSeq, qualities, sampleName, config.outFile, config.fasta)
307378
else:
308379
# Trim low quality ends
309380
let trimmed = trimSequence(sequence, qualities, config.windowSize, config.qualityThreshold)
@@ -313,7 +384,13 @@ proc main*() =
313384
if trimmed.seq.len == 0:
314385
echo "Warning: Entire sequence was below quality threshold"
315386

316-
writeFastq(trimmed.seq, trimmed.qual, sampleName, config.outFile, config.fasta)
387+
if config.split:
388+
let split = splitAmbiguousBases(trimmed.seq)
389+
if config.verbose:
390+
echo "Splitting ambiguous bases into two sequences"
391+
writeFastq(trimmed.seq, trimmed.qual, sampleName, config.outFile, config.fasta, split.seq1, split.seq2)
392+
else:
393+
writeFastq(trimmed.seq, trimmed.qual, sampleName, config.outFile, config.fasta)
317394

318395
trace.close()
319396
except:

src/abimerge.nim

Lines changed: 26 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ import ./abif
1818
## -m, --min-overlap INT Minimum overlap length for merging (default: 20)
1919
## -o, --output STRING Output file name (default: STDOUT)
2020
## -j, --join INT Join with gap of INT Ns if no overlap detected
21+
## --fasta Output in FASTA format instead of FASTQ
2122
## --score-match INT Score for a match (default: 10)
2223
## --score-mismatch INT Score for a mismatch (default: -8)
2324
## --score-gap INT Score for a gap (default: -10)
@@ -35,6 +36,9 @@ import ./abif
3536
##
3637
## # Join sequences with N gap if no overlap
3738
## abimerge -j 10 forward.ab1 reverse.ab1 merged.fastq
39+
##
40+
## # Output in FASTA format instead of FASTQ
41+
## abimerge --fasta forward.ab1 reverse.ab1 merged.fasta
3842

3943
type
4044
swAlignment* = object
@@ -270,6 +274,7 @@ type
270274
qualityThreshold*: int # Quality threshold for trimming
271275
noTrim*: bool # Whether to disable quality trimming
272276
showVersion*: bool # Whether to show version information
277+
fasta*: bool # Whether to output in FASTA format
273278

274279
proc printHelp() =
275280
echo """
@@ -284,6 +289,7 @@ Options:
284289
-o, --output STRING Output file name (default: STDOUT)
285290
-j, --join INT If no overlap is detected join the two sequences with a gap of INT Ns
286291
(reverse complement the second sequence)
292+
--fasta Output in FASTA format instead of FASTQ
287293
Quality Trimming Options:
288294
-w, --window=INT Window size for quality trimming (default: 4)
289295
-q, --quality=INT Quality threshold 0-60 (default: 22)
@@ -352,7 +358,8 @@ proc parseCommandLine(): Config =
352358
windowSize: 4, # Default window size for quality trimming
353359
qualityThreshold: 22, # Default quality threshold
354360
noTrim: false, # Enable trimming by default
355-
showVersion: false # Don't show version by default
361+
showVersion: false, # Don't show version by default
362+
fasta: false # Default to FASTQ format
356363
)
357364

358365
var fileArgs: seq[string] = @[]
@@ -377,6 +384,8 @@ proc parseCommandLine(): Config =
377384
if result.joinGap < 0:
378385
echo "Error: Join gap must not be negative"
379386
quit(1)
387+
of "fasta":
388+
result.fasta = true
380389
# Quality trimming options
381390
of "w", "window":
382391
result.windowSize = parseInt(val)
@@ -770,20 +779,25 @@ proc mergeSequences*(forwardSeq: string, forwardQual: seq[int],
770779
result.seq = mergedSeq
771780
result.qual = mergedQual
772781

773-
proc writeFastq(sequence: string, qualities: seq[int], name: string, outFile: string = "") =
774-
# Convert quality values to Phred+33 format
775-
var qualityString = ""
776-
for qv in qualities:
777-
qualityString.add(chr(qv + 33))
782+
proc writeSequence(sequence: string, qualities: seq[int], name: string, outFile: string = "", fastaMode: bool = false) =
  ## Emits the merged read as a single FASTA or FASTQ record.
  ##
  ## The record header is `name` with the suffix `_merged`. In FASTQ mode each
  ## quality value is encoded as the character with code `value + 33`; in
  ## FASTA mode `qualities` is not used.
  ##
  ## Parameters:
  ##   sequence: The sequence to write
  ##   qualities: Quality scores, one per base (FASTQ mode only)
  ##   name: Base name for the record header
  ##   outFile: Path to the output file (empty string for stdout)
  ##   fastaMode: Write FASTA instead of FASTQ when true
  let record =
    if not fastaMode:
      # FASTQ: header, sequence, separator line and encoded qualities
      var encoded = ""
      for score in qualities:
        encoded.add(chr(score + 33))
      &"@{name}_merged\n{sequence}\n+\n{encoded}"
    else:
      # FASTA: header and sequence only
      &">{name}_merged\n{sequence}"

  if outFile != "":
    # A target path was given: write the record to that file
    writeFile(outFile, record & "\n")
  else:
    # No target path: send the record to stdout
    stdout.write(record & "\n")
787801

788802
proc main() =
789803
let config = parseCommandLine()
@@ -793,6 +807,7 @@ proc main() =
793807
echo " Forward: ", config.inputFileF
794808
echo " Reverse: ", config.inputFileR
795809
echo " Output: ", if config.outputFile == "": "STDOUT" else: config.outputFile
810+
echo " Output format: ", if config.fasta: "FASTA" else: "FASTQ"
796811
echo "Parameters:"
797812
echo " Minimum overlap: ", config.minOverlap
798813
echo " Match score: ", config.scoreMatch
@@ -865,8 +880,8 @@ proc main() =
865880
# Use sample name from forward read as the merged read name
866881
let mergedName = nameF
867882

868-
# Write output FASTQ
869-
writeFastq(merged.seq, merged.qual, mergedName, config.outputFile)
883+
# Write output in FASTA or FASTQ format
884+
writeSequence(merged.seq, merged.qual, mergedName, config.outputFile, config.fasta)
870885

871886
# Close traces
872887
traceF.close()

tests/test.fasta

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
>JB.Sample1A-PrimerA-F_merged
2+
ATGTTAGAGCTACAACTACTAATATTCTTGATAAACTTTATACTCCGCCATATAATTGTGATGGTAAAGATAGATGTTGTCAACTTATTATTAATAGTGCTCAAAGAATTACTTGGAAAGAAGTTGAATCTATTGAAGAATTAGGAGAAAGTGAACGTGGAAACAAAGGATTTGGAGAAGGAACCGGAGGAGCAGCTAAAGCTTAAAGTTGGTGCTCGTTATATTCATAATAAAACGTCTAATGAATATATAATTTTTAGTATTACTAAAATGAAACATCCAGATACAGGTGAATGGATTCCTGCTGTTATTTATAGAATTGATGGACTTGAACCTTTATGGTGTAGAAATGTTGAAAATTTTAATAGTCATTTTATAGATGCTAAGGTTGAGAGGTTTGAATTTTATCAATGAAAAAGTTAATCATTTTTTATTTAGCTGTGTGAGATAACTTTAAACTTTATGAAGAAAGGATGCTCATGGTTCTCGTGAGTTCTCAGGTAGCCGTAATGTTTTTTCTTATAATCTATAATCTGTATCCGGAAGCCAAGAAGCGAGTAGCGCGGGTATTAGTGATGCTCGTACTGCTTATTGCTAAGGTGATATTTGACATTGTGTGCAAAGCGATTACACTATACGCTATTATCTTCGCAAGTCGCTAGGTGATTCATAGGTCATAGTCTTTTGCCCTTCACCGAATTTTAAATTTTAATGTGATTAATCTATCACGATATGAATATCATTCAATGTTGGGCTTTAAAATAGCTAATTTTATAAATTCTCATTATATTATACGAATACTATGGTTAAAATTGAATTTTATTATAAAAGTGCTGATAAAGATAAAACAGAAGCTATGCGAGAAGCTATTGATATAGCTTTATTTGGTACTAATGTTCAATGTAATTTTAAAAATCTTCCTGACCATCTTATTCTTGAAGATATGATACTTGAAAAGGCTGTTGTACTGAATATCCTACTTGTATTATATATCGAGATGATACAGAATATAAACGATATAGTAATTCTGTTACTTGGGAAGAACTTCGTAATGATATTAATTATCTTACTGGAGATGAACCTACAAGACAAACAAATAATATATTTGTTGAAGCGTTTATTGATGAACATGATTGTATAACTCGTGCTAAATGTGCTGATGCTATTGCTTGGATGTGGAAATATCAGAATACTAAAGTAGAATATATTCAAACT

0 commit comments

Comments
 (0)