@@ -7,7 +7,6 @@ import Chunk from './chunk.ts'
77import CSI from './csi.ts'
88import IndexFile , { IndexData , Options } from './indexFile.ts'
99import TBI from './tbi.ts'
10- import { checkAbortSignal } from './util.ts'
1110
1211import type { GenericFilehandle } from 'generic-filehandle2'
1312
@@ -196,7 +195,6 @@ export default class TabixIndexedFile {
196195 }
197196
198197 const metadata = await this . index . getMetadata ( options )
199- checkAbortSignal ( signal )
200198 const start = s ?? 0
201199 const end = e ?? metadata . maxRefLength
202200 if ( ! ( start <= end ) ) {
@@ -209,9 +207,22 @@ export default class TabixIndexedFile {
209207 }
210208
211209 const chunks = await this . index . blocksForRange ( refName , start , end , options )
212- checkAbortSignal ( signal )
213210 const decoder = new TextDecoder ( 'utf8' )
214211
212+ const isVCF = metadata . format === 'VCF'
213+ const columnNumbersEffective = {
214+ ref : metadata . columnNumbers . ref || 0 ,
215+ start : metadata . columnNumbers . start || 0 ,
216+ end : isVCF ? 8 : metadata . columnNumbers . end || 0 ,
217+ }
218+ const maxColumn = Math . max (
219+ columnNumbersEffective . ref ,
220+ columnNumbersEffective . start ,
221+ columnNumbersEffective . end ,
222+ )
223+ const metaCharCode = metadata . metaChar ?. charCodeAt ( 0 )
224+ const coordinateType = metadata . coordinateType
225+
215226 // now go through each chunk and parse and filter the lines out of it
216227 for ( const c of chunks ) {
217228 const { buffer, cpositions, dpositions } = await this . chunkCache . get (
@@ -220,7 +231,6 @@ export default class TabixIndexedFile {
220231 signal ,
221232 )
222233
223- checkAbortSignal ( signal )
224234 let blockStart = 0
225235 let pos = 0
226236
@@ -251,15 +261,24 @@ export default class TabixIndexedFile {
251261 }
252262
253263 // filter the line for whether it is within the requested range
254- const { startCoordinate, overlaps } = this . checkLine (
255- metadata ,
264+ const result = this . checkLine (
256265 refName ,
257266 start ,
258267 end ,
259268 line ,
269+ columnNumbersEffective ,
270+ maxColumn ,
271+ metaCharCode ,
272+ coordinateType ,
273+ isVCF ,
260274 )
261275
262- if ( overlaps ) {
276+ if ( result === null ) {
277+ // the lines were overlapping the region, but now have stopped, so we
278+ // must be at the end of the relevant data and we can stop processing
279+ // data now
280+ return
281+ } else if ( result !== undefined ) {
263282 callback (
264283 line ,
265284 this . calculateFileOffset (
@@ -270,11 +289,6 @@ export default class TabixIndexedFile {
270289 c . minv . dataPosition ,
271290 ) ,
272291 )
273- } else if ( startCoordinate !== undefined && startCoordinate >= end ) {
274- // the lines were overlapping the region, but now have stopped, so we
275- // must be at the end of the relevant data and we can stop processing
276- // data now
277- return
278292 }
279293 blockStart = n + 1
280294 }
@@ -296,15 +310,24 @@ export default class TabixIndexedFile {
296310 }
297311
298312 // filter the line for whether it is within the requested range
299- const { startCoordinate, overlaps } = this . checkLine (
300- metadata ,
313+ const result = this . checkLine (
301314 refName ,
302315 start ,
303316 end ,
304317 line ,
318+ columnNumbersEffective ,
319+ maxColumn ,
320+ metaCharCode ,
321+ coordinateType ,
322+ isVCF ,
305323 )
306324
307- if ( overlaps ) {
325+ if ( result === null ) {
326+ // the lines were overlapping the region, but now have stopped, so we
327+ // must be at the end of the relevant data and we can stop processing
328+ // data now
329+ return
330+ } else if ( result !== undefined ) {
308331 callback (
309332 line ,
310333 this . calculateFileOffset (
@@ -315,11 +338,6 @@ export default class TabixIndexedFile {
315338 c . minv . dataPosition ,
316339 ) ,
317340 )
318- } else if ( startCoordinate !== undefined && startCoordinate >= end ) {
319- // the lines were overlapping the region, but now have stopped, so we
320- // must be at the end of the relevant data and we can stop processing
321- // data now
322- return
323341 }
324342 blockStart = n + 1
325343 }
@@ -339,8 +357,6 @@ export default class TabixIndexedFile {
339357 const { firstDataLine, metaChar, maxBlockSize } =
340358 await this . getMetadata ( opts )
341359
342- checkAbortSignal ( opts . signal )
343-
344360 const maxFetch = ( firstDataLine ?. blockPosition || 0 ) + maxBlockSize
345361 // TODO: what if we don't have a firstDataLine, and the header actually
346362 // takes up more than one block? this case is not covered here
@@ -391,141 +407,120 @@ export default class TabixIndexedFile {
391407 }
392408
/**
 * Decide whether a single text line is a data line that overlaps the
 * requested region.
 *
 * The line is scanned field by field with `indexOf` instead of being split,
 * so that very long lines do not allocate an array of every column; scanning
 * stops at `maxColumn` or at the end of the line, whichever comes first.
 *
 * @param regionRefName - reference sequence name of the requested region
 * @param regionStart - region start coordinate (0-based-half-open)
 * @param regionEnd - region end coordinate (0-based-half-open)
 * @param line - one line of text, without its trailing newline
 * @param columnNumbersEffective - 1-based column numbers of the ref, start
 * and end fields; 0 means "no such column"
 * @param maxColumn - the largest of the three column numbers; no field past
 * it is inspected
 * @param metaCharCode - char code of the meta (comment) character, or
 * undefined when the index declares none
 * @param coordinateType - `'1-based-closed'` makes the parsed start get
 * shifted down by one to become 0-based; any other value leaves it as is
 * @param isVCF - when true, column 4 is kept as the REF allele and the end
 * column is interpreted as the INFO field by `_getVcfEnd`
 * @returns the 0-based start coordinate when the line passes all checks
 * (`-Infinity` if no start column was seen), `null` when the line starts at
 * or after `regionEnd` (the caller uses this to stop scanning), and
 * `undefined` when the line is a meta line, is on another reference
 * sequence, or ends at or before `regionStart`
 */
checkLine(
  regionRefName: string,
  regionStart: number,
  regionEnd: number,
  line: string,
  columnNumbersEffective: { ref: number; start: number; end: number },
  maxColumn: number,
  metaCharCode: number | undefined,
  coordinateType: string,
  isVCF: boolean,
): number | null | undefined {
  // meta lines are never data
  if (metaCharCode !== undefined && line.charCodeAt(0) === metaCharCode) {
    return undefined
  }

  const { ref, start, end } = columnNumbersEffective
  // with no distinct end column the feature is assumed to be 1 bp long
  const isSingleBase = end === 0 || end === start

  let refSeq = ''
  let startCoordinate = -Infinity
  let fieldStart = 0

  // columns are numbered starting at 1 in the index metadata
  for (let column = 1; column <= maxColumn; column++) {
    const tabPos = line.indexOf('\t', fieldStart)
    const fieldEnd = tabPos === -1 ? line.length : tabPos

    if (column === ref) {
      if (
        this.renameRefSeq(line.slice(fieldStart, fieldEnd)) !== regionRefName
      ) {
        return undefined
      }
    } else if (column === start) {
      startCoordinate = Number.parseInt(line.slice(fieldStart, fieldEnd), 10)
      // convert to 0-based-half-open
      if (coordinateType === '1-based-closed') {
        startCoordinate -= 1
      }
      if (startCoordinate >= regionEnd) {
        return null
      }
      if (isSingleBase && startCoordinate + 1 <= regionStart) {
        return undefined
      }
    } else if (isVCF && column === 4) {
      refSeq = line.slice(fieldStart, fieldEnd)
    } else if (column === end) {
      // never reached when there is no end column (end === 0)
      const field = line.slice(fieldStart, fieldEnd)
      const endCoordinate = isVCF
        ? this._getVcfEnd(startCoordinate, refSeq, field)
        : Number.parseInt(field, 10)
      if (endCoordinate <= regionStart) {
        return undefined
      }
    }

    if (tabPos === -1) {
      // the line has no more fields: stop here rather than evaluating the
      // remaining column numbers against empty strings
      break
    }
    fieldStart = fieldEnd + 1
  }

  return startCoordinate
}
511505
/**
 * Compute the end coordinate of a VCF record.
 *
 * The default end is `startCoordinate + refSeq.length`. When the INFO field
 * carries an `END=` key, its integer value is used instead. Only a key that
 * sits at the very start of INFO or directly after a `;` counts, so keys
 * that merely end in "END" (for example `CIEND=`) are not mistaken for it.
 *
 * @param startCoordinate - start coordinate of the record
 * @param refSeq - contents of the REF column
 * @param info - contents of the INFO column
 * @returns `startCoordinate + 1` for `SVTYPE=TRA` records, otherwise the
 * `END=` value when present and non-empty, otherwise the default end
 */
_getVcfEnd(startCoordinate: number, refSeq: string, info: any) {
  // ignore END on TRA features: they specify CHR2 and END as being on a
  // different chromosome, and if CHR2 is the same chromosome there should be
  // another pairwise feature at the end of this one
  if (info.includes('SVTYPE=TRA')) {
    return startCoordinate + 1
  }

  let endCoordinate = startCoordinate + refSeq.length
  if (info[0] !== '.') {
    // locate the value of an END key that is a whole key, not a suffix of
    // another key
    let valueStart = -1
    if (info.startsWith('END=')) {
      valueStart = 4
    } else {
      const keyIdx = info.indexOf(';END=')
      if (keyIdx !== -1) {
        valueStart = keyIdx + 5
      }
    }

    if (valueStart !== -1) {
      const semicolon = info.indexOf(';', valueStart)
      const valueEnd = semicolon === -1 ? info.length : semicolon
      // an empty value (`END=;`) is treated as if END were absent
      if (valueEnd > valueStart) {
        endCoordinate = Number.parseInt(info.slice(valueStart, valueEnd), 10)
      }
    }
  }
  return endCoordinate
}
531526
0 commit comments