Skip to content

Commit 38d43b5

Browse files
committed
Optimizations
1 parent 518920b commit 38d43b5

File tree

3 files changed

+122
-170
lines changed

3 files changed

+122
-170
lines changed

src/tabixIndexedFile.ts

Lines changed: 121 additions & 126 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@ import Chunk from './chunk.ts'
77
import CSI from './csi.ts'
88
import IndexFile, { IndexData, Options } from './indexFile.ts'
99
import TBI from './tbi.ts'
10-
import { checkAbortSignal } from './util.ts'
1110

1211
import type { GenericFilehandle } from 'generic-filehandle2'
1312

@@ -196,7 +195,6 @@ export default class TabixIndexedFile {
196195
}
197196

198197
const metadata = await this.index.getMetadata(options)
199-
checkAbortSignal(signal)
200198
const start = s ?? 0
201199
const end = e ?? metadata.maxRefLength
202200
if (!(start <= end)) {
@@ -209,9 +207,22 @@ export default class TabixIndexedFile {
209207
}
210208

211209
const chunks = await this.index.blocksForRange(refName, start, end, options)
212-
checkAbortSignal(signal)
213210
const decoder = new TextDecoder('utf8')
214211

212+
const isVCF = metadata.format === 'VCF'
213+
const columnNumbersEffective = {
214+
ref: metadata.columnNumbers.ref || 0,
215+
start: metadata.columnNumbers.start || 0,
216+
end: isVCF ? 8 : metadata.columnNumbers.end || 0,
217+
}
218+
const maxColumn = Math.max(
219+
columnNumbersEffective.ref,
220+
columnNumbersEffective.start,
221+
columnNumbersEffective.end,
222+
)
223+
const metaCharCode = metadata.metaChar?.charCodeAt(0)
224+
const coordinateType = metadata.coordinateType
225+
215226
// now go through each chunk and parse and filter the lines out of it
216227
for (const c of chunks) {
217228
const { buffer, cpositions, dpositions } = await this.chunkCache.get(
@@ -220,7 +231,6 @@ export default class TabixIndexedFile {
220231
signal,
221232
)
222233

223-
checkAbortSignal(signal)
224234
let blockStart = 0
225235
let pos = 0
226236

@@ -251,15 +261,24 @@ export default class TabixIndexedFile {
251261
}
252262

253263
// filter the line for whether it is within the requested range
254-
const { startCoordinate, overlaps } = this.checkLine(
255-
metadata,
264+
const result = this.checkLine(
256265
refName,
257266
start,
258267
end,
259268
line,
269+
columnNumbersEffective,
270+
maxColumn,
271+
metaCharCode,
272+
coordinateType,
273+
isVCF,
260274
)
261275

262-
if (overlaps) {
276+
if (result === null) {
277+
// the lines were overlapping the region, but now have stopped, so we
278+
// must be at the end of the relevant data and we can stop processing
279+
// data now
280+
return
281+
} else if (result !== undefined) {
263282
callback(
264283
line,
265284
this.calculateFileOffset(
@@ -270,11 +289,6 @@ export default class TabixIndexedFile {
270289
c.minv.dataPosition,
271290
),
272291
)
273-
} else if (startCoordinate !== undefined && startCoordinate >= end) {
274-
// the lines were overlapping the region, but now have stopped, so we
275-
// must be at the end of the relevant data and we can stop processing
276-
// data now
277-
return
278292
}
279293
blockStart = n + 1
280294
}
@@ -296,15 +310,24 @@ export default class TabixIndexedFile {
296310
}
297311

298312
// filter the line for whether it is within the requested range
299-
const { startCoordinate, overlaps } = this.checkLine(
300-
metadata,
313+
const result = this.checkLine(
301314
refName,
302315
start,
303316
end,
304317
line,
318+
columnNumbersEffective,
319+
maxColumn,
320+
metaCharCode,
321+
coordinateType,
322+
isVCF,
305323
)
306324

307-
if (overlaps) {
325+
if (result === null) {
326+
// the lines were overlapping the region, but now have stopped, so we
327+
// must be at the end of the relevant data and we can stop processing
328+
// data now
329+
return
330+
} else if (result !== undefined) {
308331
callback(
309332
line,
310333
this.calculateFileOffset(
@@ -315,11 +338,6 @@ export default class TabixIndexedFile {
315338
c.minv.dataPosition,
316339
),
317340
)
318-
} else if (startCoordinate !== undefined && startCoordinate >= end) {
319-
// the lines were overlapping the region, but now have stopped, so we
320-
// must be at the end of the relevant data and we can stop processing
321-
// data now
322-
return
323341
}
324342
blockStart = n + 1
325343
}
@@ -339,8 +357,6 @@ export default class TabixIndexedFile {
339357
const { firstDataLine, metaChar, maxBlockSize } =
340358
await this.getMetadata(opts)
341359

342-
checkAbortSignal(opts.signal)
343-
344360
const maxFetch = (firstDataLine?.blockPosition || 0) + maxBlockSize
345361
// TODO: what if we don't have a firstDataLine, and the header actually
346362
// takes up more than one block? this case is not covered here
@@ -391,141 +407,120 @@ export default class TabixIndexedFile {
391407
}
392408

393409
/**
 * Decide whether one line of the data file overlaps the requested region.
 *
 * Columns are located with `indexOf` instead of splitting the line: if the
 * lines are really long a split would allocate a lot of extra memory, which
 * is slow. Only the columns up to `maxColumn` are ever sliced out.
 *
 * @param regionRefName reference sequence name of the requested region
 *
 * @param regionStart region start coordinate (0-based-half-open)
 *
 * @param regionEnd region end coordinate (0-based-half-open)
 *
 * @param line one line of text from the file
 *
 * @param columnNumbersEffective pre-calculated ref/start/end column numbers.
 * Columns are numbered starting at 1 in the index metadata; 0 means the
 * column is absent and will never match
 *
 * @param maxColumn pre-calculated largest of the three column numbers;
 * parsing stops once this column has been examined
 *
 * @param metaCharCode pre-calculated char code of the meta character, or
 * undefined when there is none
 *
 * @param coordinateType coordinate type from metadata; starts are shifted to
 * 0-based when this is '1-based-closed'
 *
 * @param isVCF whether this is VCF format, in which case column 4 is read as
 * the REF allele and the end column is interpreted by `_getVcfEnd`
 *
 * @returns the start coordinate if the line overlaps the region, `null` if
 * the line starts at or past `regionEnd` (the caller can stop processing),
 * `undefined` if the line should simply be skipped
 */
checkLine(
  regionRefName: string,
  regionStart: number,
  regionEnd: number,
  line: string,
  columnNumbersEffective: { ref: number; start: number; end: number },
  maxColumn: number,
  metaCharCode: number | undefined,
  coordinateType: string,
  isVCF: boolean,
): number | null | undefined {
  // skip meta lines
  if (metaCharCode !== undefined && line.charCodeAt(0) === metaCharCode) {
    return undefined
  }

  const { ref: refCol, start: startCol, end: endCol } = columnNumbersEffective
  const l = line.length
  let currentColumnNumber = 1
  let currentColumnStart = 0
  let refSeq = ''
  let startCoordinate = -Infinity

  while (currentColumnNumber <= maxColumn) {
    const tabPos = line.indexOf('\t', currentColumnStart)
    const columnEnd = tabPos === -1 ? l : tabPos

    if (currentColumnNumber === refCol) {
      if (
        this.renameRefSeq(line.slice(currentColumnStart, columnEnd)) !==
        regionRefName
      ) {
        return undefined
      }
    } else if (currentColumnNumber === startCol) {
      startCoordinate = Number.parseInt(
        line.slice(currentColumnStart, columnEnd),
        10,
      )
      // we convert to 0-based-half-open
      if (coordinateType === '1-based-closed') {
        startCoordinate -= 1
      }
      if (startCoordinate >= regionEnd) {
        return null
      }
      // if we have no end, we assume the feature is 1 bp long
      if (
        (endCol === 0 || endCol === startCol) &&
        startCoordinate + 1 <= regionStart
      ) {
        return undefined
      }
    } else if (isVCF && currentColumnNumber === 4) {
      refSeq = line.slice(currentColumnStart, columnEnd)
    } else if (currentColumnNumber === endCol) {
      // this will never match if there is no end column
      const endCoordinate = isVCF
        ? this._getVcfEnd(
            startCoordinate,
            refSeq,
            line.slice(currentColumnStart, columnEnd),
          )
        : Number.parseInt(line.slice(currentColumnStart, columnEnd), 10)
      if (endCoordinate <= regionStart) {
        return undefined
      }
    }

    // Stop after the last column of interest, and also when the line has run
    // out of columns: a short line must not be checked against phantom empty
    // columns past its end.
    if (currentColumnNumber === maxColumn || tabPos === -1) {
      break
    }

    currentColumnStart = columnEnd + 1
    currentColumnNumber += 1
  }
  return startCoordinate
}
511505

512506
/**
 * Compute the end coordinate of a VCF record.
 *
 * @param startCoordinate start coordinate of the record
 *
 * @param refSeq contents of the REF column; its length gives the default
 * span of the record
 *
 * @param info contents of the INFO column
 *
 * @returns `startCoordinate + 1` for `SVTYPE=TRA` records; otherwise the
 * parsed value of the INFO `END` key when one is present, falling back to
 * `startCoordinate + refSeq.length`
 */
_getVcfEnd(startCoordinate: number, refSeq: string, info: any) {
  // ignore TRA features as they specify CHR2 and END as being on a different
  // chromosome
  //
  // if CHR2 is on the same chromosome, still ignore it because there should
  // be another pairwise feature at the end of this one
  if (info.includes('SVTYPE=TRA')) {
    return startCoordinate + 1
  }

  let endCoordinate = startCoordinate + refSeq.length
  if (info[0] !== '.') {
    // Look for an `END=` key that sits at the very start of INFO or right
    // after a ';'. The first textual hit may be the tail of another key such
    // as `CIEND=`, so keep scanning instead of giving up on it.
    let keyIdx = info.indexOf('END=')
    while (keyIdx !== -1) {
      if (keyIdx === 0 || info[keyIdx - 1] === ';') {
        const valueStart = keyIdx + 4
        const semicolon = info.indexOf(';', valueStart)
        const valueEnd = semicolon === -1 ? info.length : semicolon
        // an empty value is not a usable END; keep looking
        if (valueEnd > valueStart) {
          endCoordinate = Number.parseInt(info.slice(valueStart, valueEnd), 10)
          break
        }
      }
      keyIdx = info.indexOf('END=', keyIdx + 4)
    }
  }
  return endCoordinate
}
531526

src/tbi.ts

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ import { unzip } from '@gmod/bgzf-filehandle'
33
import Chunk from './chunk.ts'
44
import IndexFile, { Options } from './indexFile.ts'
55
import { longFromBytesToUnsigned } from './long.ts'
6-
import { checkAbortSignal, optimizeChunks } from './util.ts'
6+
import { optimizeChunks } from './util.ts'
77
import VirtualOffset, { fromBytes } from './virtualOffset.ts'
88

99
const TBI_MAGIC = 21578324 // TBI\1
@@ -44,7 +44,6 @@ export default class TabixIndex extends IndexFile {
4444
async _parse(opts: Options = {}) {
4545
const buf = await this.filehandle.readFile(opts)
4646
const bytes = await unzip(buf)
47-
checkAbortSignal(opts.signal)
4847
const dataView = new DataView(bytes.buffer)
4948

5049
const magic = dataView.getUint32(0, true)

0 commit comments

Comments
 (0)