Skip to content

Commit 8a8ab20

Browse files
committed
GTF support
1 parent 16201f5 commit 8a8ab20

File tree

5 files changed

+127
-32
lines changed

5 files changed

+127
-32
lines changed

bin/covtotarget

65.9 KB
Binary file not shown.

bin/gff2bed

69.7 KB
Binary file not shown.

scripts/benchmarking/make-bam-targeted.py

Lines changed: 29 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -114,20 +114,35 @@ def makeTarget(genomeSize, targetSize, numFeatures=1):
114114
target[start] = end
115115
return target
116116

117+
def savetarget(target, file):
118+
"""
119+
Save target in bed format
120+
"""
121+
bed = open(file, "w")
122+
chrname = "chromosome"
123+
124+
for t in target:
125+
name = f"chromosome:{t}-{target[t]}"
126+
print(f"{chrname}\t{t}\t{target[t]}\t{name}", file=bed)
127+
128+
129+
117130

118131

119132
if __name__ == "__main__":
120133
parser = argparse.ArgumentParser(description="Simulate a BAM file with long reads mapped against a hypothetical genome")
121134
parser.add_argument("-o", "--output", help="Output BAM file", required=True)
122135
parser.add_argument("-l", "--length", help="Length of the genome [default: %(default)s]", default="100M")
123136

124-
parser.add_argument("-n", "--num-reads", help="Number of reads [default: %(default)s]", default="100M")
137+
parser.add_argument("-n", "--num-reads", help="Number of reads [default: %(default)s]", default="1M")
125138
parser.add_argument("-t", "--target-size", help="Bases in target [default: %(default)s]", default="10M")
126139
parser.add_argument("-f", "--target-features", help="Number of features [default: %(default)s]", type=int, default=10)
127140

128-
parser.add_argument("-m", "--min-len", help="Minimum read length [default: 1000]", type=int, default=1000)
129-
parser.add_argument("-M", "--max-len", help="Minimum read length [default: 10000]", type=int,default=10000)
141+
parser.add_argument("-m", "--min-len", help="Minimum read length [default: 1000]", type=int, default=50)
142+
parser.add_argument("-M", "--max-len", help="Minimum read length [default: 10000]", type=int,default=300)
130143
parser.add_argument("-s", "--seed", help="Random seed [default: 42]", type=int,default=42)
144+
parser.add_argument("--multiply", help="Multiply the number of reads by this number", type=int, default=1)
145+
parser.add_argument("--randomcigar", help="Use random CIGAR strings", action="store_true")
131146
parser.add_argument("--progress", help="Print progress every INT reads [default: 10000]", type=int, default=10000)
132147

133148

@@ -144,14 +159,18 @@ def makeTarget(genomeSize, targetSize, numFeatures=1):
144159
bamHeader = makeHeader(genome)
145160
target = makeTarget(genomeSize, stringSizeToInt(opts.target_size), opts.target_features)
146161

162+
bedOutput = opts.output.replace("bam", "bed")
163+
savetarget(target, bedOutput)
147164
totalReads = stringSizeToInt(opts.num_reads)
148165
generatedBases = 0
149166
n = 0
150167
seqPosLength = {}
151168

152-
while n < totalReads:
169+
for n in track(range(totalReads), total=totalReads, description="Generating positions..."):
170+
#while n < totalReads:
171+
#n += 1
153172
readLength = int(opts.min_len + (opts.max_len - opts.min_len) * random.random())
154-
n += 1
173+
155174
# Generate a random position being included in a random interval of the target
156175
intervalStart = random.choice(list(target.keys()))
157176
intervalEnd = target[intervalStart]
@@ -162,16 +181,15 @@ def makeTarget(genomeSize, targetSize, numFeatures=1):
162181
else:
163182
seqPosLength[pos] = [readLength]
164183

165-
166-
eprint(f"Generated positions for {totalReads} reads (from {len(seqPosLength)} positions)")
167184

185+
186+
168187
# Sort seqPosLength keys ascending
169188
seqPosLength = {k: v for k, v in sorted(seqPosLength.items())}
170-
eprint(f"Generating {totalReads} reads, {generatedBases} bp")
171189
# Generate reads
172190
with pysam.AlignmentFile(opts.output, "wb", header=bamHeader) as outf:
173191
n = 0
174-
for index, pos in track(enumerate(seqPosLength), total=len(seqPosLength), description="Writing BAM..."):
192+
for index, pos in track(enumerate(seqPosLength), total=len(seqPosLength), description="Writing BAM... "):
175193

176194

177195
for length in seqPosLength[pos]:
@@ -184,9 +202,7 @@ def makeTarget(genomeSize, targetSize, numFeatures=1):
184202
seqString = "A" * length
185203
try:
186204
a = pysam.AlignedSegment()
187-
188-
189-
a.query_sequence = "chr1"
205+
#a.query_sequence = "chr1"
190206
a.flag = 0
191207
a.reference_id = 0
192208
a.reference_start = pos
@@ -208,6 +224,7 @@ def makeTarget(genomeSize, targetSize, numFeatures=1):
208224
raise Exception("Read extends past end of genome")
209225
outf.write(a)
210226

227+
211228
except Exception as e:
212229
eprint("ERROR:", sys.exc_info(), a)
213230
eprint(f"{n} {pos} {seqPosLength[pos]}")

src/bamtocov.nim

Lines changed: 45 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -210,6 +210,8 @@ template doAssert(condition: bool, message: string) = # FIXME is this already in
210210
stderr.writeLine("ERROR: ", message)
211211
quit(1)
212212

213+
214+
# COVERAGE FUNCTIONS #
213215
proc newCov(f = 0, r = 0): coverage_t =
214216
coverage_t(forward: f, reverse: r)
215217

@@ -228,12 +230,24 @@ proc dec(c: var coverage_t, reverse=false) =
228230
proc tot(c: coverage_t): int =
229231
c.forward + c.reverse
230232

233+
proc max(c1: coverage_t, c2: coverage_t): coverage_t =
234+
newCov(max(c1.forward, c2.forward), max(c1.reverse, c2.reverse))
235+
proc min(c1: coverage_t, c2: coverage_t): coverage_t =
236+
newCov(min(c1.forward, c2.forward), min(c1.reverse, c2.reverse))
237+
proc `+`(c1: coverage_t, c2: coverage_t): coverage_t =
238+
newCov(c1.forward + c2.forward, c1.reverse + c2.reverse)
239+
240+
proc `/`(c: coverage_t, by: float): tuple[forward: float, reverse: float] =
241+
(float(c.forward)/by, float(c.reverse)/by)
242+
proc `*`(c: coverage_t, by: int): coverage_t =
243+
newCov(c.forward*by, c.reverse*by)
244+
245+
231246
proc topStop(q: HeapQueue): int64 =
232247
if not q[0].isNil:
233248
return q[0].stop
234249
return -1
235250

236-
237251
proc topReverse(q: HeapQueue): bool =
238252
if not q[0].isNil:
239253
return q[0].reverse
@@ -391,6 +405,24 @@ type
391405
chrom2str: TableRef[chrom_t, string]
392406
chrom2len: TableRef[chrom_t, pos_t]
393407

408+
proc output_wig_span(span: genomic_interval_t[coverage_t], opts: output_option_t) =
409+
let span_length = opts.span_length # FIXME the actual span can be less than span_length!
410+
let value_str =
411+
if opts.strand:
412+
case opts.span_func:
413+
of sf_max, sf_min: $span.label.forward & "\t" & $span.label.reverse
414+
of sf_mean:
415+
let mean = span.label/float(span_length)
416+
$mean.forward & "\t" & $mean.reverse
417+
else:
418+
let tot = span.label.forward + span.label.reverse
419+
case opts.span_func:
420+
of sf_max, sf_min: $tot
421+
of sf_mean: $(float(tot)/float(span_length))
422+
echo $span.start & "\t" & value_str
423+
424+
425+
394426
proc write_output(o: var output_t, i: genomic_interval_t[coverage_t]) =
395427
if o.current_span.chrom != i.chrom or i.start < i.stop: # skip empty intervals
396428
case o.opts.output_format:
@@ -412,16 +444,13 @@ proc write_output(o: var output_t, i: genomic_interval_t[coverage_t]) =
412444
if len(o.quantization_index2label) > 0:
413445
stderr.writeLine("wig output does not support quantized coverage")
414446
raise
415-
if o.opts.strand:
416-
stderr.writeLine("wig output does not support stranded coverage")
417-
raise
447+
#if o.opts.strand:
448+
# stderr.writeLine("wig output does not support stranded coverage")
449+
# raise
418450
let span_length = o.opts.span_length
419451
if o.current_span.chrom != i.chrom: # start new contig
420452
if o.current_span.chrom != -1 and o.current_span.start < o.chrom2len[o.current_span.chrom]: # output last possibly incomplete span from previous chrom
421-
let span_value = case o.opts.span_func:
422-
of sf_max, sf_min: $o.current_span.label.forward
423-
of sf_mean: $(float(o.current_span.label.forward)/float(span_length)) # FIXME the actual span is less than span_length!
424-
echo $o.current_span.start & "\t" & span_value
453+
output_wig_span(o.current_span, o.opts)
425454
if i.chrom == -1:
426455
return
427456

@@ -434,22 +463,18 @@ proc write_output(o: var output_t, i: genomic_interval_t[coverage_t]) =
434463
while o.current_span.start <= i.stop:
435464
let inter = intersection_first(o.current_span, i)
436465
if not is_empty(inter): # update the current span value
437-
o.current_span.label.forward = case o.opts.span_func:
438-
of sf_max: max(o.current_span.label.forward, i.label.forward)
439-
of sf_min: min(o.current_span.label.forward, i.label.forward)
440-
of sf_mean: o.current_span.label.forward + i.label.forward*int(len(inter))
466+
o.current_span.label = case o.opts.span_func:
467+
of sf_max: max(o.current_span.label, i.label)
468+
of sf_min: min(o.current_span.label, i.label)
469+
of sf_mean: o.current_span.label + i.label*int(len(inter))
441470
if inter.stop == o.current_span.stop: # span is concluded
442-
# output span
443-
let span_value = case o.opts.span_func:
444-
of sf_max, sf_min: $o.current_span.label.forward
445-
of sf_mean: $(float(o.current_span.label.forward)/float(span_length))
446-
echo $o.current_span.start & "\t" & span_value
471+
output_wig_span(o.current_span, o.opts)
447472
# next span
448473
o.current_span.start += span_length
449474
o.current_span.stop = o.current_span.start + span_length
450-
o.current_span.label.forward = case o.opts.span_func
451-
of sf_max, sf_mean: 0
452-
of sf_min: high(int)
475+
o.current_span.label = case o.opts.span_func
476+
of sf_max, sf_mean: newCov(0, 0)
477+
of sf_min: newCov(high(int), high(int))
453478
else: # span extends beyond the interval, we are done
454479
break
455480

src/covutils.nim

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,36 @@ proc tostring*(r: region_t, s:var string) {.inline.} =
4242
s.add(r.name & "\t")
4343
s.add($r.count)
4444

45+
# Converts a GTF line to region object
46+
proc gtf_line_to_region*(line: string, gffField = "exon", gffSeparator = ";", gffIdentifier = "gene_id"): region_t =
47+
#NC_001422.1 Prodigal:002006 gene 51 221 . + 0 gene_id "nbis-gene-1"; ID "nbis-gene-1"; inference "ab initio prediction:Prodigal:002006"; locus_tag "PhiX_01"; product "hypothetical protein";
48+
var
49+
cse = line.strip().split('\t')
50+
51+
if len(cse) < 8:
52+
stderr.write_line("[warning] skipping GTF line (fields not found):", line.strip())
53+
return nil
54+
55+
# Skip non CDS fields (or user provided)
56+
if cse[2] != gffField:
57+
return nil
4558

59+
var
60+
s = parse_int(cse[3]) - 1
61+
e = parse_int(cse[4])
62+
reg = region_t(chrom: cse[0], start: s, stop: e, count:0)
63+
64+
# In the future, 8th field could be requireed [TODO]
65+
if len(cse) == 9:
66+
for gffAnnotPartRaw in cse[8].split(gffSeparator):
67+
let gffAnnotPart = gffAnnotPartRaw.strip(chars = {'"', '\'', ' '})
68+
if gffAnnotPart.startsWith(gffIdentifier):
69+
try:
70+
reg.name = gffAnnotPart.split("=")[1].strip(chars = {'"', '\'', ' '})
71+
except:
72+
reg.name = gffAnnotPart.split(" ")[1].strip(chars = {'"', '\'', ' '})
73+
break
74+
return reg
4675

4776
# Converts a GFF line to region object
4877
proc gff_line_to_region*(line: string, gffField = "CDS", gffSeparator = ";", gffIdentifier = "ID"): region_t =
@@ -123,6 +152,30 @@ proc bed_to_table*(bed: string): TableRef[string, seq[region_t]] =
123152
return bed_regions
124153

125154

155+
proc gtf_to_table*(bed: string, gffField, gffSeparator, gffIdentifier: string): TableRef[string, seq[region_t]] =
156+
var bed_regions = newTable[string, seq[region_t]]()
157+
var hf = hts.hts_open(cstring(bed), "r")
158+
var kstr: hts.kstring_t
159+
kstr.l = 0
160+
kstr.m = 0
161+
kstr.s = nil
162+
while hts_getline(hf, cint(10), addr kstr) > 0:
163+
if ($kstr.s).startswith("##FASTA"):
164+
break
165+
if $kstr.s[0] == "#":
166+
continue
167+
168+
var v = gtf_line_to_region($kstr.s, gffField, gffSeparator, gffIdentifier)
169+
if v == nil: continue
170+
discard bed_regions.hasKeyOrPut(v.chrom, new_seq[region_t]())
171+
bed_regions[v.chrom].add(v)
172+
173+
# since it is read into mem, can also well sort.
174+
for chrom, ivs in bed_regions.mpairs:
175+
sort(ivs, proc (a, b: region_t): int = a.start - b.start)
176+
177+
hts.free(kstr.s)
178+
return bed_regions
126179

127180
proc gff_to_table*(bed: string, gffField, gffSeparator, gffIdentifier: string): TableRef[string, seq[region_t]] =
128181
var bed_regions = newTable[string, seq[region_t]]()

0 commit comments

Comments
 (0)