Skip to content

Commit c7bd4b5

Browse files
committed
Optimizations
1 parent 518920b commit c7bd4b5

File tree

3 files changed

+123
-170
lines changed

3 files changed

+123
-170
lines changed

src/tabixIndexedFile.ts

Lines changed: 122 additions & 126 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@ import Chunk from './chunk.ts'
77
import CSI from './csi.ts'
88
import IndexFile, { IndexData, Options } from './indexFile.ts'
99
import TBI from './tbi.ts'
10-
import { checkAbortSignal } from './util.ts'
1110

1211
import type { GenericFilehandle } from 'generic-filehandle2'
1312

@@ -196,7 +195,6 @@ export default class TabixIndexedFile {
196195
}
197196

198197
const metadata = await this.index.getMetadata(options)
199-
checkAbortSignal(signal)
200198
const start = s ?? 0
201199
const end = e ?? metadata.maxRefLength
202200
if (!(start <= end)) {
@@ -209,9 +207,22 @@ export default class TabixIndexedFile {
209207
}
210208

211209
const chunks = await this.index.blocksForRange(refName, start, end, options)
212-
checkAbortSignal(signal)
213210
const decoder = new TextDecoder('utf8')
214211

212+
const isVCF = metadata.format === 'VCF'
213+
const columnNumbersEffective = {
214+
ref: metadata.columnNumbers.ref || 0,
215+
start: metadata.columnNumbers.start || 0,
216+
end: isVCF ? 8 : metadata.columnNumbers.end || 0,
217+
}
218+
const maxColumn = Math.max(
219+
columnNumbersEffective.ref,
220+
columnNumbersEffective.start,
221+
columnNumbersEffective.end,
222+
)
223+
const metaCharCode = metadata.metaChar?.charCodeAt(0)
224+
const coordinateType = metadata.coordinateType
225+
215226
// now go through each chunk and parse and filter the lines out of it
216227
for (const c of chunks) {
217228
const { buffer, cpositions, dpositions } = await this.chunkCache.get(
@@ -220,7 +231,6 @@ export default class TabixIndexedFile {
220231
signal,
221232
)
222233

223-
checkAbortSignal(signal)
224234
let blockStart = 0
225235
let pos = 0
226236

@@ -251,15 +261,24 @@ export default class TabixIndexedFile {
251261
}
252262

253263
// filter the line for whether it is within the requested range
254-
const { startCoordinate, overlaps } = this.checkLine(
255-
metadata,
264+
const result = this.checkLine(
256265
refName,
257266
start,
258267
end,
259268
line,
269+
columnNumbersEffective,
270+
maxColumn,
271+
metaCharCode,
272+
coordinateType,
273+
isVCF,
260274
)
261275

262-
if (overlaps) {
276+
if (result === null) {
277+
// the lines were overlapping the region, but now have stopped, so we
278+
// must be at the end of the relevant data and we can stop processing
279+
// data now
280+
return
281+
} else if (result !== undefined) {
263282
callback(
264283
line,
265284
this.calculateFileOffset(
@@ -270,15 +289,11 @@ export default class TabixIndexedFile {
270289
c.minv.dataPosition,
271290
),
272291
)
273-
} else if (startCoordinate !== undefined && startCoordinate >= end) {
274-
// the lines were overlapping the region, but now have stopped, so we
275-
// must be at the end of the relevant data and we can stop processing
276-
// data now
277-
return
278292
}
279293
blockStart = n + 1
280294
}
281295
} else {
296+
console.log('not ascii?')
282297
while (blockStart < buffer.length) {
283298
const n = buffer.indexOf('\n'.charCodeAt(0), blockStart)
284299
if (n === -1) {
@@ -296,15 +311,24 @@ export default class TabixIndexedFile {
296311
}
297312

298313
// filter the line for whether it is within the requested range
299-
const { startCoordinate, overlaps } = this.checkLine(
300-
metadata,
314+
const result = this.checkLine(
301315
refName,
302316
start,
303317
end,
304318
line,
319+
columnNumbersEffective,
320+
maxColumn,
321+
metaCharCode,
322+
coordinateType,
323+
isVCF,
305324
)
306325

307-
if (overlaps) {
326+
if (result === null) {
327+
// the lines were overlapping the region, but now have stopped, so we
328+
// must be at the end of the relevant data and we can stop processing
329+
// data now
330+
return
331+
} else if (result !== undefined) {
308332
callback(
309333
line,
310334
this.calculateFileOffset(
@@ -315,11 +339,6 @@ export default class TabixIndexedFile {
315339
c.minv.dataPosition,
316340
),
317341
)
318-
} else if (startCoordinate !== undefined && startCoordinate >= end) {
319-
// the lines were overlapping the region, but now have stopped, so we
320-
// must be at the end of the relevant data and we can stop processing
321-
// data now
322-
return
323342
}
324343
blockStart = n + 1
325344
}
@@ -339,8 +358,6 @@ export default class TabixIndexedFile {
339358
const { firstDataLine, metaChar, maxBlockSize } =
340359
await this.getMetadata(opts)
341360

342-
checkAbortSignal(opts.signal)
343-
344361
const maxFetch = (firstDataLine?.blockPosition || 0) + maxBlockSize
345362
// TODO: what if we don't have a firstDataLine, and the header actually
346363
// takes up more than one block? this case is not covered here
@@ -391,141 +408,120 @@ export default class TabixIndexedFile {
391408
}
392409

393410
/**
394-
* @param {object} metadata metadata object from the parsed index, containing
395-
* columnNumbers, metaChar, and format
396-
*
397411
* @param {string} regionRefName
398412
*
399413
* @param {number} regionStart region start coordinate (0-based-half-open)
400414
*
401415
* @param {number} regionEnd region end coordinate (0-based-half-open)
402416
*
403-
* @param {array[string]} line
417+
* @param {string} line
418+
*
419+
* @param {object} columnNumbersEffective pre-calculated column numbers
420+
*
421+
* @param {number} maxColumn pre-calculated max column
422+
*
423+
* @param {number} metaCharCode pre-calculated metaChar code
404424
*
405-
* @returns {object} like `{startCoordinate, overlaps}`. overlaps is boolean,
406-
* true if line is a data line that overlaps the given region
425+
* @param {string} coordinateType coordinate type from metadata
426+
*
427+
* @param {boolean} isVCF whether this is VCF format
428+
*
429+
* @returns {number | null | undefined} startCoordinate if overlapping, null if should stop processing, undefined otherwise
407430
*/
408431
checkLine(
409-
metadata: IndexData,
410432
regionRefName: string,
411433
regionStart: number,
412434
regionEnd: number,
413435
line: string,
436+
columnNumbersEffective: { ref: number; start: number; end: number },
437+
maxColumn: number,
438+
metaCharCode: number | undefined,
439+
coordinateType: string,
440+
isVCF: boolean,
414441
) {
415-
const { columnNumbers, metaChar, coordinateType, format } = metadata
416-
// skip meta lines
417-
if (metaChar && line.startsWith(metaChar)) {
418-
return { overlaps: false }
419-
}
420-
421-
// check ref/start/end using column metadata from index
422-
let { ref, start, end } = columnNumbers
423-
if (!ref) {
424-
ref = 0
425-
}
426-
if (!start) {
427-
start = 0
428-
}
429-
if (!end) {
430-
end = 0
431-
}
432-
if (format === 'VCF') {
433-
end = 8
442+
if (metaCharCode !== undefined && line.charCodeAt(0) === metaCharCode) {
443+
return undefined
434444
}
435-
const maxColumn = Math.max(ref, start, end)
436445

437-
// this code is kind of complex, but it is fairly fast. basically, we want
438-
// to avoid doing a split, because if the lines are really long that could
439-
// lead to us allocating a bunch of extra memory, which is slow
440-
441-
let currentColumnNumber = 1 // cols are numbered starting at 1 in the index metadata
446+
let currentColumnNumber = 1
442447
let currentColumnStart = 0
443448
let refSeq = ''
444449
let startCoordinate = -Infinity
445450
const l = line.length
446-
for (let i = 0; i < l + 1; i++) {
447-
if (line[i] === '\t' || i === l) {
448-
if (currentColumnNumber === ref) {
449-
if (
450-
this.renameRefSeq(line.slice(currentColumnStart, i)) !==
451-
regionRefName
452-
) {
453-
return {
454-
overlaps: false,
455-
}
456-
}
457-
} else if (currentColumnNumber === start) {
458-
startCoordinate = Number.parseInt(
459-
line.slice(currentColumnStart, i),
460-
10,
461-
)
462-
// we convert to 0-based-half-open
463-
if (coordinateType === '1-based-closed') {
464-
startCoordinate -= 1
465-
}
466-
if (startCoordinate >= regionEnd) {
467-
return {
468-
startCoordinate,
469-
overlaps: false,
470-
}
471-
}
472-
if (
473-
(end === 0 || end === start) && // if we have no end, we assume the feature is 1 bp long
474-
startCoordinate + 1 <= regionStart
475-
) {
476-
return {
477-
startCoordinate,
478-
overlaps: false,
479-
}
480-
}
481-
} else if (format === 'VCF' && currentColumnNumber === 4) {
482-
refSeq = line.slice(currentColumnStart, i)
483-
} else if (currentColumnNumber === end) {
484-
// this will never match if there is no end column
485-
const endCoordinate =
486-
format === 'VCF'
487-
? this._getVcfEnd(
488-
startCoordinate,
489-
refSeq,
490-
line.slice(currentColumnStart, i),
491-
)
492-
: Number.parseInt(line.slice(currentColumnStart, i), 10)
493-
if (endCoordinate <= regionStart) {
494-
return {
495-
overlaps: false,
496-
}
497-
}
451+
let tabPos = line.indexOf('\t', currentColumnStart)
452+
453+
while (currentColumnNumber <= maxColumn) {
454+
const columnEnd = tabPos === -1 ? l : tabPos
455+
456+
if (currentColumnNumber === columnNumbersEffective.ref) {
457+
if (
458+
this.renameRefSeq(line.slice(currentColumnStart, columnEnd)) !==
459+
regionRefName
460+
) {
461+
return undefined
498462
}
499-
if (currentColumnNumber === maxColumn) {
500-
break
463+
} else if (currentColumnNumber === columnNumbersEffective.start) {
464+
startCoordinate = Number.parseInt(
465+
line.slice(currentColumnStart, columnEnd),
466+
10,
467+
)
468+
if (coordinateType === '1-based-closed') {
469+
startCoordinate -= 1
470+
}
471+
if (startCoordinate >= regionEnd) {
472+
return null
473+
}
474+
if (
475+
(columnNumbersEffective.end === 0 ||
476+
columnNumbersEffective.end === columnNumbersEffective.start) &&
477+
startCoordinate + 1 <= regionStart
478+
) {
479+
return undefined
480+
}
481+
} else if (isVCF && currentColumnNumber === 4) {
482+
refSeq = line.slice(currentColumnStart, columnEnd)
483+
} else if (currentColumnNumber === columnNumbersEffective.end) {
484+
const endCoordinate = isVCF
485+
? this._getVcfEnd(
486+
startCoordinate,
487+
refSeq,
488+
line.slice(currentColumnStart, columnEnd),
489+
)
490+
: Number.parseInt(line.slice(currentColumnStart, columnEnd), 10)
491+
if (endCoordinate <= regionStart) {
492+
return undefined
501493
}
502-
currentColumnStart = i + 1
503-
currentColumnNumber += 1
504494
}
495+
496+
if (currentColumnNumber === maxColumn) {
497+
break
498+
}
499+
500+
currentColumnStart = columnEnd + 1
501+
currentColumnNumber += 1
502+
tabPos = line.indexOf('\t', currentColumnStart)
505503
}
506-
return {
507-
startCoordinate,
508-
overlaps: true,
509-
}
504+
return startCoordinate
510505
}
511506

512507
_getVcfEnd(startCoordinate: number, refSeq: string, info: any) {
513508
let endCoordinate = startCoordinate + refSeq.length
514-
// ignore TRA features as they specify CHR2 and END as being on a different
515-
// chromosome
516-
//
517-
// if CHR2 is on the same chromosome, still ignore it because there should
518-
// be another pairwise feature at the end of this one
519509
const isTRA = info.includes('SVTYPE=TRA')
520-
if (info[0] !== '.' && !isTRA) {
521-
const endRegex = /(?:^|;)END=([^;]+)/
522-
const match = endRegex.exec(info)
523-
if (match) {
524-
endCoordinate = Number.parseInt(match[1]!, 10)
525-
}
526-
} else if (isTRA) {
510+
if (isTRA) {
527511
return startCoordinate + 1
528512
}
513+
514+
if (info[0] !== '.') {
515+
const endIdx = info.indexOf('END=')
516+
if (endIdx !== -1 && (endIdx === 0 || info[endIdx - 1] === ';')) {
517+
const start = endIdx + 4
518+
let end = info.indexOf(';', start)
519+
if (end === -1) {
520+
end = info.length
521+
}
522+
endCoordinate = Number.parseInt(info.slice(start, end), 10)
523+
}
524+
}
529525
return endCoordinate
530526
}
531527

src/tbi.ts

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ import { unzip } from '@gmod/bgzf-filehandle'
33
import Chunk from './chunk.ts'
44
import IndexFile, { Options } from './indexFile.ts'
55
import { longFromBytesToUnsigned } from './long.ts'
6-
import { checkAbortSignal, optimizeChunks } from './util.ts'
6+
import { optimizeChunks } from './util.ts'
77
import VirtualOffset, { fromBytes } from './virtualOffset.ts'
88

99
const TBI_MAGIC = 21578324 // TBI\1
@@ -44,7 +44,6 @@ export default class TabixIndex extends IndexFile {
4444
async _parse(opts: Options = {}) {
4545
const buf = await this.filehandle.readFile(opts)
4646
const bytes = await unzip(buf)
47-
checkAbortSignal(opts.signal)
4847
const dataView = new DataView(bytes.buffer)
4948

5049
const magic = dataView.getUint32(0, true)

0 commit comments

Comments
 (0)