@@ -7,7 +7,6 @@ import Chunk from './chunk.ts'
77import CSI from './csi.ts'
88import IndexFile , { IndexData , Options } from './indexFile.ts'
99import TBI from './tbi.ts'
10- import { checkAbortSignal } from './util.ts'
1110
1211import type { GenericFilehandle } from 'generic-filehandle2'
1312
@@ -29,6 +28,7 @@ export default class TabixIndexedFile {
2928 private filehandle : GenericFilehandle
3029 private index : IndexFile
3130 private renameRefSeq : ( n : string ) => string
31+ private hasCustomRenameRefSeq : boolean
3232 private chunkCache : AbortablePromiseCache < Chunk , ReadChunk >
3333 public cache = new LRU <
3434 string ,
@@ -142,6 +142,7 @@ export default class TabixIndexedFile {
142142 }
143143
144144 this . renameRefSeq = renameRefSeqs
145+ this . hasCustomRenameRefSeq = arguments [ 0 ] ?. renameRefSeqs !== undefined
145146 this . chunkCache = new AbortablePromiseCache < Chunk , ReadChunk > ( {
146147 cache : new LRU ( { maxSize : Math . floor ( chunkCacheSize / ( 1 << 16 ) ) } ) ,
147148 fill : ( args : Chunk , signal ?: AbortSignal ) =>
@@ -196,7 +197,6 @@ export default class TabixIndexedFile {
196197 }
197198
198199 const metadata = await this . index . getMetadata ( options )
199- checkAbortSignal ( signal )
200200 const start = s ?? 0
201201 const end = e ?? metadata . maxRefLength
202202 if ( ! ( start <= end ) ) {
@@ -209,9 +209,23 @@ export default class TabixIndexedFile {
209209 }
210210
211211 const chunks = await this . index . blocksForRange ( refName , start , end , options )
212- checkAbortSignal ( signal )
213212 const decoder = new TextDecoder ( 'utf8' )
214213
214+ const isVCF = metadata . format === 'VCF'
215+ const columnNumbersEffective = {
216+ ref : metadata . columnNumbers . ref || 0 ,
217+ start : metadata . columnNumbers . start || 0 ,
218+ end : isVCF ? 8 : metadata . columnNumbers . end || 0 ,
219+ }
220+ const maxColumn = Math . max (
221+ columnNumbersEffective . ref ,
222+ columnNumbersEffective . start ,
223+ columnNumbersEffective . end ,
224+ )
225+ const metaCharCode = metadata . metaChar ?. charCodeAt ( 0 )
226+ const coordinateOffset = metadata . coordinateType === '1-based-closed' ? - 1 : 0
227+ const isIdentityRename = ! this . hasCustomRenameRefSeq
228+
215229 // now go through each chunk and parse and filter the lines out of it
216230 for ( const c of chunks ) {
217231 const { buffer, cpositions, dpositions } = await this . chunkCache . get (
@@ -220,7 +234,6 @@ export default class TabixIndexedFile {
220234 signal ,
221235 )
222236
223- checkAbortSignal ( signal )
224237 let blockStart = 0
225238 let pos = 0
226239
@@ -251,15 +264,25 @@ export default class TabixIndexedFile {
251264 }
252265
253266 // filter the line for whether it is within the requested range
254- const { startCoordinate, overlaps } = this . checkLine (
255- metadata ,
267+ const result = this . checkLine (
256268 refName ,
257269 start ,
258270 end ,
259271 line ,
272+ columnNumbersEffective ,
273+ maxColumn ,
274+ metaCharCode ,
275+ coordinateOffset ,
276+ isVCF ,
277+ isIdentityRename ,
260278 )
261279
262- if ( overlaps ) {
280+ if ( result === null ) {
281+ // the lines were overlapping the region, but now have stopped, so we
282+ // must be at the end of the relevant data and we can stop processing
283+ // data now
284+ return
285+ } else if ( result !== undefined ) {
263286 callback (
264287 line ,
265288 this . calculateFileOffset (
@@ -270,11 +293,6 @@ export default class TabixIndexedFile {
270293 c . minv . dataPosition ,
271294 ) ,
272295 )
273- } else if ( startCoordinate !== undefined && startCoordinate >= end ) {
274- // the lines were overlapping the region, but now have stopped, so we
275- // must be at the end of the relevant data and we can stop processing
276- // data now
277- return
278296 }
279297 blockStart = n + 1
280298 }
@@ -296,15 +314,25 @@ export default class TabixIndexedFile {
296314 }
297315
298316 // filter the line for whether it is within the requested range
299- const { startCoordinate, overlaps } = this . checkLine (
300- metadata ,
317+ const result = this . checkLine (
301318 refName ,
302319 start ,
303320 end ,
304321 line ,
322+ columnNumbersEffective ,
323+ maxColumn ,
324+ metaCharCode ,
325+ coordinateOffset ,
326+ isVCF ,
327+ isIdentityRename ,
305328 )
306329
307- if ( overlaps ) {
330+ if ( result === null ) {
331+ // the lines were overlapping the region, but now have stopped, so we
332+ // must be at the end of the relevant data and we can stop processing
333+ // data now
334+ return
335+ } else if ( result !== undefined ) {
308336 callback (
309337 line ,
310338 this . calculateFileOffset (
@@ -315,11 +343,6 @@ export default class TabixIndexedFile {
315343 c . minv . dataPosition ,
316344 ) ,
317345 )
318- } else if ( startCoordinate !== undefined && startCoordinate >= end ) {
319- // the lines were overlapping the region, but now have stopped, so we
320- // must be at the end of the relevant data and we can stop processing
321- // data now
322- return
323346 }
324347 blockStart = n + 1
325348 }
@@ -339,8 +362,6 @@ export default class TabixIndexedFile {
339362 const { firstDataLine, metaChar, maxBlockSize } =
340363 await this . getMetadata ( opts )
341364
342- checkAbortSignal ( opts . signal )
343-
344365 const maxFetch = ( firstDataLine ?. blockPosition || 0 ) + maxBlockSize
345366 // TODO: what if we don't have a firstDataLine, and the header actually
346367 // takes up more than one block? this case is not covered here
@@ -391,141 +412,120 @@ export default class TabixIndexedFile {
391412 }
392413
/**
 * Decide whether a single text line belongs to the queried region, walking
 * only as many tab-separated columns as are needed (no `split`).
 *
 * Columns are numbered from 1. All column-derived values are pre-computed by
 * the caller so that nothing is recomputed per line.
 *
 * @param regionRefName - reference sequence name of the queried region
 * @param regionStart - region start coordinate (0-based-half-open)
 * @param regionEnd - region end coordinate (0-based-half-open)
 * @param line - one line of the file
 * @param columnNumbersEffective - 1-based column numbers of the ref, start
 *   and end fields; an `end` of 0 (or equal to `start`) means the feature is
 *   treated as 1 bp long
 * @param maxColumn - the largest of the three column numbers; scanning stops
 *   after this column
 * @param metaCharCode - char code that marks a meta line, or undefined when
 *   there is none
 * @param coordinateOffset - value added to the parsed start column (0 or -1)
 * @param isVCF - when true, column 4 is remembered and the end coordinate is
 *   derived through `_getVcfEnd`
 * @param isIdentityRename - when true, the ref column is compared directly
 *   and `renameRefSeq` is not called
 *
 * @returns the (offset-adjusted) start coordinate when the line passed every
 *   filter, `null` when the start is at or beyond `regionEnd` (the caller
 *   stops processing), `undefined` when the line is skipped
 */
checkLine(
  regionRefName: string,
  regionStart: number,
  regionEnd: number,
  line: string,
  columnNumbersEffective: { ref: number; start: number; end: number },
  maxColumn: number,
  metaCharCode: number | undefined,
  coordinateOffset: number,
  isVCF: boolean,
  isIdentityRename: boolean,
) {
  // meta lines never match
  if (metaCharCode !== undefined && line.charCodeAt(0) === metaCharCode) {
    return undefined
  }

  const {
    ref: refColumn,
    start: startColumn,
    end: endColumn,
  } = columnNumbersEffective
  // with no dedicated end column the feature is assumed to be 1 bp long
  const hasNoEndColumn = endColumn === 0 || endColumn === startColumn
  const lineLength = line.length

  let vcfRefAllele = ''
  let featureStart = -Infinity
  let fieldStart = 0

  for (let column = 1; column <= maxColumn; column += 1) {
    const tab = line.indexOf('\t', fieldStart)
    const fieldEnd = tab === -1 ? lineLength : tab

    if (column === refColumn) {
      const rawName = line.slice(fieldStart, fieldEnd)
      const name = isIdentityRename ? rawName : this.renameRefSeq(rawName)
      if (name !== regionRefName) {
        return undefined
      }
    } else if (column === startColumn) {
      featureStart =
        Number.parseInt(line.slice(fieldStart, fieldEnd), 10) +
        coordinateOffset
      if (featureStart >= regionEnd) {
        // past the end of the region: signal the caller to stop
        return null
      }
      if (hasNoEndColumn && featureStart + 1 <= regionStart) {
        return undefined
      }
    } else if (isVCF && column === 4) {
      vcfRefAllele = line.slice(fieldStart, fieldEnd)
    } else if (column === endColumn) {
      const field = line.slice(fieldStart, fieldEnd)
      const featureEnd = isVCF
        ? this._getVcfEnd(featureStart, vcfRefAllele, field)
        : Number.parseInt(field, 10)
      if (featureEnd <= regionStart) {
        return undefined
      }
    }

    fieldStart = fieldEnd + 1
  }

  return featureStart
}
511510
/**
 * Compute the end coordinate of a VCF record from its start, its REF allele
 * and its INFO column.
 *
 * - Records whose INFO contains `SVTYPE=TRA` are treated as 1 bp long
 *   (`startCoordinate + 1`), regardless of any END value.
 * - Otherwise, if INFO is not `.` and has an `END` key with a non-empty
 *   value, that value (parsed as a base-10 integer) is returned.
 * - Otherwise the end is `startCoordinate + refSeq.length`.
 *
 * @param startCoordinate - start coordinate of the record
 * @param refSeq - contents of the REF column
 * @param info - contents of the INFO column
 * @returns the end coordinate of the record
 */
_getVcfEnd(startCoordinate: number, refSeq: string, info: any) {
  if (info.includes('SVTYPE=TRA')) {
    return startCoordinate + 1
  }

  let endCoordinate = startCoordinate + refSeq.length
  if (info[0] !== '.') {
    // The first textual hit for "END=" may sit inside another key such as
    // "CIEND=" or "SVEND=", so keep scanning until a hit starts a field
    // (beginning of string or right after ';') and carries a value.
    for (
      let keyIdx = info.indexOf('END=');
      keyIdx !== -1;
      keyIdx = info.indexOf('END=', keyIdx + 4)
    ) {
      if (keyIdx !== 0 && info[keyIdx - 1] !== ';') {
        continue
      }
      const valueStart = keyIdx + 4
      const semicolon = info.indexOf(';', valueStart)
      const valueEnd = semicolon === -1 ? info.length : semicolon
      if (valueEnd > valueStart) {
        endCoordinate = Number.parseInt(info.slice(valueStart, valueEnd), 10)
        break
      }
    }
  }
  return endCoordinate
}
531531
0 commit comments