@@ -7,7 +7,6 @@ import Chunk from './chunk.ts'
77import CSI from './csi.ts'
88import IndexFile , { IndexData , Options } from './indexFile.ts'
99import TBI from './tbi.ts'
10- import { checkAbortSignal } from './util.ts'
1110
1211import type { GenericFilehandle } from 'generic-filehandle2'
1312
@@ -196,7 +195,6 @@ export default class TabixIndexedFile {
196195 }
197196
198197 const metadata = await this . index . getMetadata ( options )
199- checkAbortSignal ( signal )
200198 const start = s ?? 0
201199 const end = e ?? metadata . maxRefLength
202200 if ( ! ( start <= end ) ) {
@@ -209,9 +207,22 @@ export default class TabixIndexedFile {
209207 }
210208
211209 const chunks = await this . index . blocksForRange ( refName , start , end , options )
212- checkAbortSignal ( signal )
213210 const decoder = new TextDecoder ( 'utf8' )
214211
212+ const isVCF = metadata . format === 'VCF'
213+ const columnNumbersEffective = {
214+ ref : metadata . columnNumbers . ref || 0 ,
215+ start : metadata . columnNumbers . start || 0 ,
216+ end : isVCF ? 8 : metadata . columnNumbers . end || 0 ,
217+ }
218+ const maxColumn = Math . max (
219+ columnNumbersEffective . ref ,
220+ columnNumbersEffective . start ,
221+ columnNumbersEffective . end ,
222+ )
223+ const metaCharCode = metadata . metaChar ?. charCodeAt ( 0 )
224+ const coordinateType = metadata . coordinateType
225+
215226 // now go through each chunk and parse and filter the lines out of it
216227 for ( const c of chunks ) {
217228 const { buffer, cpositions, dpositions } = await this . chunkCache . get (
@@ -220,7 +231,6 @@ export default class TabixIndexedFile {
220231 signal ,
221232 )
222233
223- checkAbortSignal ( signal )
224234 let blockStart = 0
225235 let pos = 0
226236
@@ -251,15 +261,24 @@ export default class TabixIndexedFile {
251261 }
252262
253263 // filter the line for whether it is within the requested range
254- const { startCoordinate, overlaps } = this . checkLine (
255- metadata ,
264+ const result = this . checkLine (
256265 refName ,
257266 start ,
258267 end ,
259268 line ,
269+ columnNumbersEffective ,
270+ maxColumn ,
271+ metaCharCode ,
272+ coordinateType ,
273+ isVCF ,
260274 )
261275
262- if ( overlaps ) {
276+ if ( result === null ) {
277+ // the lines were overlapping the region, but now have stopped, so we
278+ // must be at the end of the relevant data and we can stop processing
279+ // data now
280+ return
281+ } else if ( result !== undefined ) {
263282 callback (
264283 line ,
265284 this . calculateFileOffset (
@@ -270,15 +289,11 @@ export default class TabixIndexedFile {
270289 c . minv . dataPosition ,
271290 ) ,
272291 )
273- } else if ( startCoordinate !== undefined && startCoordinate >= end ) {
274- // the lines were overlapping the region, but now have stopped, so we
275- // must be at the end of the relevant data and we can stop processing
276- // data now
277- return
278292 }
279293 blockStart = n + 1
280294 }
281295 } else {
296+ console . log ( 'not ascii?' )
282297 while ( blockStart < buffer . length ) {
283298 const n = buffer . indexOf ( '\n' . charCodeAt ( 0 ) , blockStart )
284299 if ( n === - 1 ) {
@@ -296,15 +311,24 @@ export default class TabixIndexedFile {
296311 }
297312
298313 // filter the line for whether it is within the requested range
299- const { startCoordinate, overlaps } = this . checkLine (
300- metadata ,
314+ const result = this . checkLine (
301315 refName ,
302316 start ,
303317 end ,
304318 line ,
319+ columnNumbersEffective ,
320+ maxColumn ,
321+ metaCharCode ,
322+ coordinateType ,
323+ isVCF ,
305324 )
306325
307- if ( overlaps ) {
326+ if ( result === null ) {
327+ // the lines were overlapping the region, but now have stopped, so we
328+ // must be at the end of the relevant data and we can stop processing
329+ // data now
330+ return
331+ } else if ( result !== undefined ) {
308332 callback (
309333 line ,
310334 this . calculateFileOffset (
@@ -315,11 +339,6 @@ export default class TabixIndexedFile {
315339 c . minv . dataPosition ,
316340 ) ,
317341 )
318- } else if ( startCoordinate !== undefined && startCoordinate >= end ) {
319- // the lines were overlapping the region, but now have stopped, so we
320- // must be at the end of the relevant data and we can stop processing
321- // data now
322- return
323342 }
324343 blockStart = n + 1
325344 }
@@ -339,8 +358,6 @@ export default class TabixIndexedFile {
339358 const { firstDataLine, metaChar, maxBlockSize } =
340359 await this . getMetadata ( opts )
341360
342- checkAbortSignal ( opts . signal )
343-
344361 const maxFetch = ( firstDataLine ?. blockPosition || 0 ) + maxBlockSize
345362 // TODO: what if we don't have a firstDataLine, and the header actually
346363 // takes up more than one block? this case is not covered here
@@ -391,141 +408,120 @@ export default class TabixIndexedFile {
391408 }
392409
/**
 * Test a single text line against the requested region without splitting
 * the whole line: only the columns up to `maxColumn` are located (via
 * successive tab searches) and inspected.
 *
 * @param {string} regionRefName reference sequence name being queried
 *
 * @param {number} regionStart region start coordinate (0-based-half-open)
 *
 * @param {number} regionEnd region end coordinate (0-based-half-open)
 *
 * @param {string} line the raw line of text to test
 *
 * @param {object} columnNumbersEffective 1-based column numbers of the ref,
 * start and end fields (0 means "no such column")
 *
 * @param {number} maxColumn largest of the three column numbers; scanning
 * stops after this column
 *
 * @param {number} metaCharCode char code that marks a meta line, or
 * undefined if there is none
 *
 * @param {string} coordinateType coordinate type from the index metadata;
 * '1-based-closed' start values are shifted to 0-based
 *
 * @param {boolean} isVCF whether the file is VCF (column 4 is then kept as
 * the REF allele and the end is derived through `_getVcfEnd`)
 *
 * @returns {number | null | undefined} the start coordinate when the line
 * passed every check; `null` when the line starts at or beyond `regionEnd`
 * (callers use this to stop reading); `undefined` when the line is a meta
 * line, is on another reference sequence, or ends at or before `regionStart`
 */
checkLine(
  regionRefName: string,
  regionStart: number,
  regionEnd: number,
  line: string,
  columnNumbersEffective: { ref: number; start: number; end: number },
  maxColumn: number,
  metaCharCode: number | undefined,
  coordinateType: string,
  isVCF: boolean,
) {
  // meta lines never count as data
  if (metaCharCode !== undefined && line.charCodeAt(0) === metaCharCode) {
    return undefined
  }

  const {
    ref: refColumn,
    start: startColumn,
    end: endColumn,
  } = columnNumbersEffective
  // with no distinct end column the feature is treated as 1 bp long
  const featureIsSinglePosition = endColumn === 0 || endColumn === startColumn
  const lineLength = line.length

  let startCoordinate = -Infinity
  let vcfRefAllele = ''
  let fieldStart = 0

  // columns are numbered starting at 1 in the index metadata
  for (let column = 1; column <= maxColumn; column++) {
    const nextTab = line.indexOf('\t', fieldStart)
    const fieldEnd = nextTab === -1 ? lineLength : nextTab

    if (column === refColumn) {
      const lineRefName = this.renameRefSeq(line.slice(fieldStart, fieldEnd))
      if (lineRefName !== regionRefName) {
        return undefined
      }
    } else if (column === startColumn) {
      startCoordinate = Number.parseInt(line.slice(fieldStart, fieldEnd), 10)
      // normalize to 0-based-half-open
      if (coordinateType === '1-based-closed') {
        startCoordinate -= 1
      }
      if (startCoordinate >= regionEnd) {
        // at or past the end of the region: tell the caller it can stop
        return null
      }
      if (featureIsSinglePosition && startCoordinate + 1 <= regionStart) {
        return undefined
      }
    } else if (isVCF && column === 4) {
      vcfRefAllele = line.slice(fieldStart, fieldEnd)
    } else if (column === endColumn) {
      const endField = line.slice(fieldStart, fieldEnd)
      const endCoordinate = isVCF
        ? this._getVcfEnd(startCoordinate, vcfRefAllele, endField)
        : Number.parseInt(endField, 10)
      if (endCoordinate <= regionStart) {
        return undefined
      }
    }

    fieldStart = fieldEnd + 1
  }

  return startCoordinate
}
511506
/**
 * Work out the end coordinate of a VCF record from its start, REF allele
 * and INFO column.
 *
 * The default end is `startCoordinate + refSeq.length`. If the INFO column
 * carries an `END=` key (at the very start of the column or directly after
 * a `;`), its integer value is used instead.
 *
 * @param {number} startCoordinate start coordinate of the record
 *
 * @param {string} refSeq contents of the REF column
 *
 * @param {string} info contents of the INFO column
 *
 * @returns {number} the end coordinate of the record
 */
_getVcfEnd(startCoordinate: number, refSeq: string, info: any) {
  // ignore TRA features as they specify CHR2 and END as being on a different
  // chromosome
  //
  // if CHR2 is on the same chromosome, still ignore it because there should
  // be another pairwise feature at the end of this one
  if (info.includes('SVTYPE=TRA')) {
    return startCoordinate + 1
  }

  let endCoordinate = startCoordinate + refSeq.length
  if (info[0] !== '.') {
    // Walk over every occurrence of 'END=': the first hit may be the tail
    // of another key such as CIEND=, in which case the real END key can
    // still follow later in the column.
    let keyIdx = info.indexOf('END=')
    while (keyIdx !== -1) {
      const valueStart = keyIdx + 4
      if (keyIdx === 0 || info[keyIdx - 1] === ';') {
        const semicolon = info.indexOf(';', valueStart)
        const valueEnd = semicolon === -1 ? info.length : semicolon
        // an empty value ("END=;") is not treated as a usable END key
        if (valueEnd > valueStart) {
          endCoordinate = Number.parseInt(info.slice(valueStart, valueEnd), 10)
          break
        }
      }
      keyIdx = info.indexOf('END=', valueStart)
    }
  }
  return endCoordinate
}
531527
0 commit comments