Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions ishlib/formats/bench_fastxpp_read_once.mojo
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import sys
from ishlib.vendor.kseq import FastxReader, BufferedReader
from ishlib.vendor.zlib import GZFile


@no_inline
fn bench_fastxpp_read_once(path: String) raises -> (Int, Int, Int):
    """Drain `path` by calling `read_fastxpp_read_once` until it reports a negative value.

    The parsed records are discarded; only the reader call is exercised.

    Args:
        path: File opened through `GZFile` in "r" mode.

    Returns:
        Always `(0, 0, 0)`; no statistics are collected.
    """
    var reader = FastxReader[read_comment=False](
        BufferedReader(GZFile(path, "r"))
    )
    # Keep pulling records; the first negative status ends the run.
    var status = reader.read_fastxpp_read_once()
    while status >= 0:
        status = reader.read_fastxpp_read_once()
    return (0, 0, 0)


fn main() raises:
    """Run `bench_fastxpp_read_once` on the file named by the first argument.

    Prints a usage line and does nothing else when no file argument is given.
    """
    var args = sys.argv()
    if len(args) >= 2:
        var path = String(args[1])
        # The returned tuple is unpacked but not used further.
        r, s, t = bench_fastxpp_read_once(path)
    else:
        print("Usage: bench_fastxpp_read_once <file>")
23 changes: 23 additions & 0 deletions ishlib/formats/bench_fastxpp_strip_newline.mojo
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import sys
from ishlib.vendor.kseq import FastxReader, BufferedReader
from ishlib.vendor.zlib import GZFile


@no_inline
fn bench_fastxpp_strip_newline(path: String) raises -> (Int, Int, Int):
    """Drain `path` by calling `read_fastxpp_strip_newline` until it reports a negative value.

    The parsed records are discarded; only the reader call is exercised.

    Args:
        path: File opened through `GZFile` in "r" mode.

    Returns:
        Always `(0, 0, 0)`; no statistics are collected.
    """
    var reader = FastxReader[read_comment=False](
        BufferedReader(GZFile(path, "r"))
    )
    # Keep pulling records; the first negative status ends the run.
    var status = reader.read_fastxpp_strip_newline()
    while status >= 0:
        status = reader.read_fastxpp_strip_newline()
    return (0, 0, 0)


fn main() raises:
    """Run `bench_fastxpp_strip_newline` on the file named by the first argument.

    Prints a usage line and does nothing else when no file argument is given.
    """
    var args = sys.argv()
    if len(args) >= 2:
        var path = String(args[1])
        # The returned tuple is unpacked but not used further.
        r, s, t = bench_fastxpp_strip_newline(path)
    else:
        print("Usage: bench_fastxpp_strip_newline <file>")
23 changes: 23 additions & 0 deletions ishlib/formats/bench_fastxpp_swar.mojo
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import sys
from ishlib.vendor.kseq import FastxReader, BufferedReader
from ishlib.vendor.zlib import GZFile


@no_inline
fn bench_fastxpp_swar(path: String) raises -> (Int, Int, Int):
    """Drain `path` by calling `read_fastxpp_swar` until it reports a negative value.

    The parsed records are discarded; only the reader call is exercised.

    Args:
        path: File opened through `GZFile` in "r" mode.

    Returns:
        Always `(0, 0, 0)`; no statistics are collected.
    """
    var reader = FastxReader[read_comment=False](
        BufferedReader(GZFile(path, "r"))
    )
    # Keep pulling records; the first negative status ends the run.
    var status = reader.read_fastxpp_swar()
    while status >= 0:
        status = reader.read_fastxpp_swar()
    return (0, 0, 0)


fn main() raises:
    """Run `bench_fastxpp_swar` on the file named by the first argument.

    Prints a usage line and does nothing else when no file argument is given.
    """
    var args = sys.argv()
    if len(args) >= 2:
        var path = String(args[1])
        # The returned tuple is unpacked but not used further.
        r, s, t = bench_fastxpp_swar(path)
    else:
        print("Usage: bench_fastxpp_swar <file>")
23 changes: 23 additions & 0 deletions ishlib/formats/bench_original.mojo
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import sys
from ishlib.vendor.kseq import FastxReader, BufferedReader
from ishlib.vendor.zlib import GZFile


@no_inline
fn bench_original(path: String) raises -> (Int, Int, Int):
    """Drain `path` with plain `read` calls, discarding every record.

    Args:
        path: File opened through `GZFile` in "r" mode.

    Returns:
        Always `(0, 0, 0)`; no statistics are collected.
    """
    var reader = FastxReader[read_comment=False](
        BufferedReader(GZFile(path, "r"))
    )
    # The loop ends on the first non-positive return value (0 included).
    # NOTE(review): confirm that `read` cannot return 0 for a valid record,
    # otherwise the benchmark stops early.
    while True:
        if reader.read() <= 0:
            break
    return (0, 0, 0)


fn main() raises:
    """Run `bench_original` on the file named by the first argument.

    Prints a usage line and does nothing else when no file argument is given.
    """
    var args = sys.argv()
    if len(args) >= 2:
        var path = String(args[1])
        # The returned tuple is unpacked but not used further.
        r, s, t = bench_original(path)
    else:
        print("Usage: bench_original <file>")
183 changes: 183 additions & 0 deletions ishlib/formats/generate_fastxpp.mojo
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
import sys
from collections import Optional
from ExtraMojo.io.buffered import BufferedReader, BufferedWriter
from collections import List # dynamic grow-able buffer
from memory import Span # view into the List for zero-copy writes

# ---------- helpers -------------------------------------------------


fn string_count(s: String) -> Int:
    """Return the number of codepoints in `s`.

    Args:
        s: Text to measure.

    Returns:
        How many items `s.codepoints()` yields (0 for an empty string).
    """
    var total = 0
    for _ in s.codepoints():
        total += 1
    return total


fn read_line(mut rdr: BufferedReader) raises -> String:
    """Read bytes from `rdr` up to the next LF and return them as a String.

    Args:
        rdr: Reader to pull from; it is advanced by the call.

    Returns:
        The bytes `read_until` placed in the buffer, or an empty string when
        `read_until` reports 0. Callers cannot tell a 0-byte line from the
        end of input.
    """
    # presumably `read_until` leaves the delimiter out of the buffer —
    # confirm against the ExtraMojo documentation.
    var raw = List[UInt8]()
    var line = String()
    if rdr.read_until(raw, ord("\n")) != 0:
        line.write_bytes(Span(raw))
    return line


# ---------- FASTX++ builder -----------------------------------------


fn generate_fastxpp(
    marker: String,
    header: String,
    seq_lines: List[String],
    qualities: Optional[List[String]] = None,
) -> String:
    """Build one record whose header line carries an `hlen:slen:nlin` block.

    The first output line is: marker, a backtick, the three colon-separated
    numbers, a backtick, the header text, LF. Every sequence line follows,
    each terminated by LF. When `qualities` holds a value, a "+" line and
    then every quality line (each LF-terminated) are appended.

    Args:
        marker: Text placed at the very start of the record.
        header: Header text written after the metadata block.
        seq_lines: Sequence lines without line terminators. May be empty.
        qualities: Optional quality lines without line terminators.

    Returns:
        The assembled record text.
    """
    # hlen and slen are codepoint counts (see `string_count`).
    # NOTE(review): if a reader uses them as byte offsets, non-ASCII text
    # would be mis-measured — confirm inputs are ASCII.
    var seq_len: Int = 0
    for i in range(len(seq_lines)):
        seq_len += string_count(seq_lines[i])

    # The former unused `bpl` local indexed seq_lines[0], which is out of
    # bounds for an empty list; it has been removed.
    var meta = (
        String(string_count(header))
        + ":"
        + String(seq_len)
        + ":"
        + String(len(seq_lines))
    )

    var rec = marker + "`" + meta + "`" + header + "\n"

    for i in range(len(seq_lines)):
        rec.write(seq_lines[i], "\n")

    if qualities:
        var q = qualities.value()
        rec += "+\n"
        for i in range(len(q)):
            rec.write(q[i], "\n")

    return rec


fn generate_fastxpp_bpl(
    marker: String,
    header: String,
    seq_lines: List[String],
    qualities: Optional[List[String]] = None,
) -> String:
    """Build one record whose header line carries an `hlen:slen:nlin:bpl` block.

    The first output line is: marker, a backtick, the four colon-separated
    numbers, a backtick, the header text, LF. Sequence lines and, when
    present, quality lines follow, each terminated by LF.

    `bpl` is the codepoint count of the first sequence line plus one (the
    LF). `slen` is computed as (bpl - 1) * (nlin - 1) plus the length of the
    last line, i.e. it assumes every line except the last is as long as the
    first. With no sequence lines, `slen` is 0 and `bpl` is 1.

    Args:
        marker: Text placed at the very start of the record.
        header: Header text written after the metadata block.
        seq_lines: Sequence lines without line terminators. May be empty.
        qualities: Optional quality lines without line terminators.

    Returns:
        The assembled record text.
    """
    var nlin = len(seq_lines)
    var bpl = 1  # LF only; replaced below when there is a first line
    var slen = 0
    # Guard the indexing: seq_lines[0] / seq_lines[-1] are out of bounds on
    # an empty list.
    if nlin > 0:
        bpl = string_count(seq_lines[0]) + 1  # bases + LF
        slen = (
            (bpl - 1) * (nlin - 1)  # full-length lines
            + string_count(seq_lines[nlin - 1])  # last (ragged) line
        )

    # Parenthesised so the multi-line expression is a single statement.
    var meta = (
        String(string_count(header))
        + ":"
        + String(slen)
        + ":"
        + String(nlin)
        + ":"
        + String(bpl)
    )
    var rec = marker + "`" + meta + "`" + header + "\n"
    for i in range(nlin):
        rec.write(seq_lines[i], "\n")
    if qualities:
        # NOTE(review): no "+" separator line is written before the quality
        # lines here, unlike `generate_fastxpp` — confirm the reader expects
        # this layout.
        var q = qualities.value()
        for i in range(len(q)):
            rec.write(q[i], "\n")
    return rec

fn to_ascii_padded(value: Int, width: Int) -> String:
    """Render `value` in decimal, left-padded with "0" up to `width` characters.

    Args:
        value: Number to render via `String(value)`.
        width: Target length of the result.

    Returns:
        The padded text. When the decimal text is already `width` characters
        or longer, no padding is added and the text is returned unchanged, so
        the result can exceed `width`.
    """
    var digits = String(value)
    var out = String(capacity=width)
    # Number of leading zeros still to emit; zero or negative means none.
    var zeros_left = width - string_count(digits)
    while zeros_left > 0:
        out.write("0")
        zeros_left -= 1
    out.write(digits)
    return out

fn generate_fastxpp_bpl_fixed(
    marker: String,
    header: String,
    seq_lines: List[String],
    qualities: Optional[List[String]] = None,
) -> String:
    """Build one record with a fixed-width metadata block on the header line.

    The first output line is: marker, a backtick, `slen` as 9 zero-padded
    digits, `nlin` as 7 zero-padded digits, `bpl` as 3 zero-padded digits, a
    backtick, the header text, LF. Sequence lines and, when present, quality
    lines follow, each terminated by LF.

    `bpl` is the codepoint count of the first sequence line plus one (the
    LF). `slen` is computed as (bpl - 1) * (nlin - 1) plus the length of the
    last line, i.e. it assumes every line except the last is as long as the
    first. With no sequence lines, `slen` is 0 and `bpl` is 1.

    Args:
        marker: Text placed at the very start of the record.
        header: Header text written after the metadata block.
        seq_lines: Sequence lines without line terminators. May be empty.
        qualities: Optional quality lines without line terminators.

    Returns:
        The assembled record text.
    """
    # --- numeric fields ------------------------------------------------
    var nlin = len(seq_lines)
    var bpl = 1  # LF only; replaced below when there is a first line
    var slen = 0
    # Guard the indexing: seq_lines[0] / seq_lines[-1] are out of bounds on
    # an empty list.
    if nlin > 0:
        bpl = string_count(seq_lines[0]) + 1  # incl. LF
        slen = (bpl - 1) * (nlin - 1) + string_count(seq_lines[nlin - 1])

    # --- fixed-width metadata block ------------------------------------
    # A 6-digit header-length field is currently disabled:
    #   to_ascii_padded(string_count(header), 6)
    # Parenthesised so the multi-line expression is a single statement.
    var meta = (
        "`"
        + to_ascii_padded(slen, 9)  # slen
        + to_ascii_padded(nlin, 7)  # nlin
        + to_ascii_padded(bpl, 3)  # bpl
        + "`"
    )

    # --- assemble record -----------------------------------------------
    var rec = marker + meta + header + "\n"
    for i in range(nlin):
        rec.write(seq_lines[i], "\n")
    if qualities:
        # NOTE(review): no "+" separator line is written before the quality
        # lines here, unlike `generate_fastxpp` — confirm the reader expects
        # this layout.
        var q = qualities.value()
        for i in range(len(q)):
            rec.write(q[i], "\n")
    return rec

# ---------- main ----------------------------------------------------


fn main() raises:
    """Convert a FASTA/FASTQ file into FASTX++ records.

    Usage: two arguments, the input path and the output path. Each input
    record is re-emitted through `generate_fastxpp_bpl_fixed`.

    Parsing rules, as implemented below:
      * The first character of a header line is the record marker.
      * Sequence lines run until a line starting with ">" or "@" (kept as
        the next record's header), a "+" line in a record whose marker is
        "@", or an empty read.
      * After a "+" line, as many quality lines are read as there were
        sequence lines.
      * `read_line` returning "" is treated as the end of input.
    """
    var argv = sys.argv()
    if len(argv) != 3:
        print(
            "Usage: mojo run generate_fastxpp.mojo <input.fastx>"
            " <output.fastxpp>"
        )
        return

    var reader = BufferedReader(
        open(String(argv[1]), "r"), buffer_capacity=128 * 1024
    )
    var writer = BufferedWriter(
        open(String(argv[2]), "w"), buffer_capacity=128 * 1024
    )

    var pending_header = String()  # carries a header we already read

    while True:
        # Use the header found while scanning the previous record, if any.
        var header_line = pending_header
        if header_line == "":
            header_line = read_line(reader)
        pending_header = String()

        if header_line == "":
            break

        var marker = String(header_line[0:1])
        var header = String(header_line[1:])

        var seq = List[String]()
        var has_quality = False

        while True:
            var line = read_line(reader)
            if line == "":
                break
            if marker == "@" and line.startswith("+"):
                # The "+" separator belongs to this record. It must not be
                # stored as the pending header, or the next iteration would
                # process it as a record with marker "+" and no sequence.
                has_quality = True
                break
            if line.startswith(">") or line.startswith("@"):
                pending_header = line  # save for the next record
                break
            seq.append(line)

        var qual: Optional[List[String]] = None
        if has_quality:
            # Quality lines are read by count, so one that starts with "@"
            # is not mistaken for a header.
            var qlines = List[String]()
            for _ in range(len(seq)):
                qlines.append(read_line(reader))
            qual = Optional[List[String]](qlines)

        writer.write(generate_fastxpp_bpl_fixed(marker, header, seq, qual))

    writer.flush()
    writer.close()
Loading