Skip to content

Commit a127b08

Browse files
committed
Optimizations
1 parent 518920b commit a127b08

File tree

3 files changed

+126
-169
lines changed

3 files changed

+126
-169
lines changed

src/tabixIndexedFile.ts

Lines changed: 125 additions & 125 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@ import Chunk from './chunk.ts'
77
import CSI from './csi.ts'
88
import IndexFile, { IndexData, Options } from './indexFile.ts'
99
import TBI from './tbi.ts'
10-
import { checkAbortSignal } from './util.ts'
1110

1211
import type { GenericFilehandle } from 'generic-filehandle2'
1312

@@ -29,6 +28,7 @@ export default class TabixIndexedFile {
2928
private filehandle: GenericFilehandle
3029
private index: IndexFile
3130
private renameRefSeq: (n: string) => string
31+
private hasCustomRenameRefSeq: boolean
3232
private chunkCache: AbortablePromiseCache<Chunk, ReadChunk>
3333
public cache = new LRU<
3434
string,
@@ -142,6 +142,7 @@ export default class TabixIndexedFile {
142142
}
143143

144144
this.renameRefSeq = renameRefSeqs
145+
this.hasCustomRenameRefSeq = arguments[0]?.renameRefSeqs !== undefined
145146
this.chunkCache = new AbortablePromiseCache<Chunk, ReadChunk>({
146147
cache: new LRU({ maxSize: Math.floor(chunkCacheSize / (1 << 16)) }),
147148
fill: (args: Chunk, signal?: AbortSignal) =>
@@ -196,7 +197,6 @@ export default class TabixIndexedFile {
196197
}
197198

198199
const metadata = await this.index.getMetadata(options)
199-
checkAbortSignal(signal)
200200
const start = s ?? 0
201201
const end = e ?? metadata.maxRefLength
202202
if (!(start <= end)) {
@@ -209,9 +209,23 @@ export default class TabixIndexedFile {
209209
}
210210

211211
const chunks = await this.index.blocksForRange(refName, start, end, options)
212-
checkAbortSignal(signal)
213212
const decoder = new TextDecoder('utf8')
214213

214+
const isVCF = metadata.format === 'VCF'
215+
const columnNumbersEffective = {
216+
ref: metadata.columnNumbers.ref || 0,
217+
start: metadata.columnNumbers.start || 0,
218+
end: isVCF ? 8 : metadata.columnNumbers.end || 0,
219+
}
220+
const maxColumn = Math.max(
221+
columnNumbersEffective.ref,
222+
columnNumbersEffective.start,
223+
columnNumbersEffective.end,
224+
)
225+
const metaCharCode = metadata.metaChar?.charCodeAt(0)
226+
const coordinateOffset = metadata.coordinateType === '1-based-closed' ? -1 : 0
227+
const isIdentityRename = !this.hasCustomRenameRefSeq
228+
215229
// now go through each chunk and parse and filter the lines out of it
216230
for (const c of chunks) {
217231
const { buffer, cpositions, dpositions } = await this.chunkCache.get(
@@ -220,7 +234,6 @@ export default class TabixIndexedFile {
220234
signal,
221235
)
222236

223-
checkAbortSignal(signal)
224237
let blockStart = 0
225238
let pos = 0
226239

@@ -251,15 +264,25 @@ export default class TabixIndexedFile {
251264
}
252265

253266
// filter the line for whether it is within the requested range
254-
const { startCoordinate, overlaps } = this.checkLine(
255-
metadata,
267+
const result = this.checkLine(
256268
refName,
257269
start,
258270
end,
259271
line,
272+
columnNumbersEffective,
273+
maxColumn,
274+
metaCharCode,
275+
coordinateOffset,
276+
isVCF,
277+
isIdentityRename,
260278
)
261279

262-
if (overlaps) {
280+
if (result === null) {
281+
// the lines were overlapping the region, but now have stopped, so we
282+
// must be at the end of the relevant data and we can stop processing
283+
// data now
284+
return
285+
} else if (result !== undefined) {
263286
callback(
264287
line,
265288
this.calculateFileOffset(
@@ -270,11 +293,6 @@ export default class TabixIndexedFile {
270293
c.minv.dataPosition,
271294
),
272295
)
273-
} else if (startCoordinate !== undefined && startCoordinate >= end) {
274-
// the lines were overlapping the region, but now have stopped, so we
275-
// must be at the end of the relevant data and we can stop processing
276-
// data now
277-
return
278296
}
279297
blockStart = n + 1
280298
}
@@ -296,15 +314,25 @@ export default class TabixIndexedFile {
296314
}
297315

298316
// filter the line for whether it is within the requested range
299-
const { startCoordinate, overlaps } = this.checkLine(
300-
metadata,
317+
const result = this.checkLine(
301318
refName,
302319
start,
303320
end,
304321
line,
322+
columnNumbersEffective,
323+
maxColumn,
324+
metaCharCode,
325+
coordinateOffset,
326+
isVCF,
327+
isIdentityRename,
305328
)
306329

307-
if (overlaps) {
330+
if (result === null) {
331+
// the lines were overlapping the region, but now have stopped, so we
332+
// must be at the end of the relevant data and we can stop processing
333+
// data now
334+
return
335+
} else if (result !== undefined) {
308336
callback(
309337
line,
310338
this.calculateFileOffset(
@@ -315,11 +343,6 @@ export default class TabixIndexedFile {
315343
c.minv.dataPosition,
316344
),
317345
)
318-
} else if (startCoordinate !== undefined && startCoordinate >= end) {
319-
// the lines were overlapping the region, but now have stopped, so we
320-
// must be at the end of the relevant data and we can stop processing
321-
// data now
322-
return
323346
}
324347
blockStart = n + 1
325348
}
@@ -339,8 +362,6 @@ export default class TabixIndexedFile {
339362
const { firstDataLine, metaChar, maxBlockSize } =
340363
await this.getMetadata(opts)
341364

342-
checkAbortSignal(opts.signal)
343-
344365
const maxFetch = (firstDataLine?.blockPosition || 0) + maxBlockSize
345366
// TODO: what if we don't have a firstDataLine, and the header actually
346367
// takes up more than one block? this case is not covered here
@@ -391,141 +412,120 @@ export default class TabixIndexedFile {
391412
}
392413

393414
/**
 * Decide whether one line of the data file belongs to the queried region.
 *
 * The line is walked tab by tab and only the columns that carry the
 * reference name, the start and the end are sliced out, so the line is never
 * split into an array of fields. Column numbers are 1-based; a column number
 * of 0 never matches any column and therefore acts as "not present".
 *
 * @param regionRefName - reference sequence name of the queried region
 * @param regionStart - region start coordinate (0-based-half-open)
 * @param regionEnd - region end coordinate (0-based-half-open)
 * @param line - one line of text from the data file
 * @param columnNumbersEffective - pre-calculated ref/start/end column numbers
 * @param maxColumn - pre-calculated largest of those column numbers; no
 *   column after it is examined
 * @param metaCharCode - pre-calculated char code of the meta character, or
 *   undefined when there is none; a line whose first char code equals it is
 *   rejected
 * @param coordinateOffset - value added to the parsed start column (0 or -1)
 * @param isVCF - whether this is VCF format; if so column 4 is remembered as
 *   refSeq and the end column is interpreted through `_getVcfEnd`
 * @param isIdentityRename - whether renameRefSeq is the identity function,
 *   in which case the call to it is skipped
 * @returns the start coordinate when no check rejected the line (this is
 *   -Infinity if the start column was never reached); `null` when the start
 *   coordinate is at or past `regionEnd`, which signals that processing
 *   should stop; `undefined` when the line is rejected for any other reason
 */
checkLine(
  regionRefName: string,
  regionStart: number,
  regionEnd: number,
  line: string,
  columnNumbersEffective: { ref: number; start: number; end: number },
  maxColumn: number,
  metaCharCode: number | undefined,
  coordinateOffset: number,
  isVCF: boolean,
  isIdentityRename: boolean,
): number | null | undefined {
  // meta lines are never data
  const isMetaLine =
    metaCharCode !== undefined && line.charCodeAt(0) === metaCharCode
  if (isMetaLine) {
    return undefined
  }

  const {
    ref: refColumn,
    start: startColumn,
    end: endColumn,
  } = columnNumbersEffective
  // with no distinct end column the feature is assumed to be 1 bp long
  const hasNoSeparateEnd = endColumn === 0 || endColumn === startColumn
  const lineLength = line.length

  let refSeq = ''
  let startCoordinate = -Infinity
  let fieldStart = 0

  for (let column = 1; column <= maxColumn; column += 1) {
    // the current field runs up to the next tab, or to the end of the line
    const nextTab = line.indexOf('\t', fieldStart)
    const fieldEnd = nextTab === -1 ? lineLength : nextTab

    if (column === refColumn) {
      const rawName = line.slice(fieldStart, fieldEnd)
      const name = isIdentityRename ? rawName : this.renameRefSeq(rawName)
      if (name !== regionRefName) {
        return undefined
      }
    } else if (column === startColumn) {
      const parsedStart = Number.parseInt(line.slice(fieldStart, fieldEnd), 10)
      startCoordinate = parsedStart + coordinateOffset
      if (startCoordinate >= regionEnd) {
        // at or past the end of the region: tell the caller to stop
        return null
      }
      if (hasNoSeparateEnd && startCoordinate + 1 <= regionStart) {
        return undefined
      }
    } else if (isVCF && column === 4) {
      refSeq = line.slice(fieldStart, fieldEnd)
    } else if (column === endColumn) {
      const rawEnd = line.slice(fieldStart, fieldEnd)
      const endCoordinate = isVCF
        ? this._getVcfEnd(startCoordinate, refSeq, rawEnd)
        : Number.parseInt(rawEnd, 10)
      if (endCoordinate <= regionStart) {
        return undefined
      }
    }

    fieldStart = fieldEnd + 1
  }

  return startCoordinate
}
511510

512511
/**
 * Compute the end coordinate of a VCF record.
 *
 * The default end is `startCoordinate + refSeq.length`. When the INFO column
 * carries an `END=` key with a non-empty value, that value (parsed as a
 * base-10 integer) replaces the default.
 *
 * @param startCoordinate - start coordinate of the record
 * @param refSeq - text of column 4 of the record; only its length is used
 * @param info - text of the INFO column of the record
 * @returns the end coordinate of the record
 */
_getVcfEnd(startCoordinate: number, refSeq: string, info: any) {
  // ignore TRA features as they specify CHR2 and END as being on a different
  // chromosome
  //
  // if CHR2 is on the same chromosome, still ignore it because there should
  // be another pairwise feature at the end of this one
  if (info.includes('SVTYPE=TRA')) {
    return startCoordinate + 1
  }

  let endCoordinate = startCoordinate + refSeq.length
  if (info[0] !== '.') {
    // Walk every occurrence of "END=" rather than only the first one: the
    // first hit may sit inside another key such as "CIEND=", and stopping
    // there would miss a genuine ";END=" later in the string.
    for (
      let keyIdx = info.indexOf('END=');
      keyIdx !== -1;
      keyIdx = info.indexOf('END=', keyIdx + 4)
    ) {
      // a real key starts the string or directly follows a ';'
      if (keyIdx === 0 || info[keyIdx - 1] === ';') {
        const valueStart = keyIdx + 4
        const semicolon = info.indexOf(';', valueStart)
        const valueEnd = semicolon === -1 ? info.length : semicolon
        // an empty value ("END=;") is not accepted; keep looking
        if (valueEnd > valueStart) {
          endCoordinate = Number.parseInt(info.slice(valueStart, valueEnd), 10)
          break
        }
      }
    }
  }
  return endCoordinate
}
531531

src/tbi.ts

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ import { unzip } from '@gmod/bgzf-filehandle'
33
import Chunk from './chunk.ts'
44
import IndexFile, { Options } from './indexFile.ts'
55
import { longFromBytesToUnsigned } from './long.ts'
6-
import { checkAbortSignal, optimizeChunks } from './util.ts'
6+
import { optimizeChunks } from './util.ts'
77
import VirtualOffset, { fromBytes } from './virtualOffset.ts'
88

99
const TBI_MAGIC = 21578324 // TBI\1
@@ -44,7 +44,6 @@ export default class TabixIndex extends IndexFile {
4444
async _parse(opts: Options = {}) {
4545
const buf = await this.filehandle.readFile(opts)
4646
const bytes = await unzip(buf)
47-
checkAbortSignal(opts.signal)
4847
const dataView = new DataView(bytes.buffer)
4948

5049
const magic = dataView.getUint32(0, true)

0 commit comments

Comments
 (0)