Skip to content

Commit 775e35c

Browse files
committed
Merge pull request #411 from hammerlab/issue-410
Better handle 2bit files with huge headers
2 parents e768565 + 0efc85c commit 775e35c

File tree

6 files changed

+102
-10
lines changed

6 files changed

+102
-10
lines changed

.travis.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
sudo: false # Use container-based infrastructure
22
language: node_js
33
node_js:
4-
- "0.12"
4+
- "5.1"
55

66
script: >
77
npm run build &&

package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@
8282
"mocha-phantomjs-istanbul": "0.0.2",
8383
"number-to-locale-string": "^1.0.0",
8484
"parse-data-uri": "^0.2.0",
85-
"phantomjs": "^1.9.17",
85+
"phantomjs": "1.9.17",
8686
"prepush-hook": "^0.1.0",
8787
"react-addons-test-utils": "^0.14.0",
8888
"sinon": "^1.12.2",

src/main/data/TwoBit.js

Lines changed: 92 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,18 @@ var BASE_PAIRS = [
2020
'G' // 3=11
2121
];
2222

23+
/**
24+
The following chunk sizes are optimized against
25+
the human reference genome (hg19.2bit). Assuming that
26+
pileup.js is mostly being used for human genome,
27+
increasing the following numbers might cause unnecessary
28+
network traffic and might also break our unit tests
29+
that make use of mapped 2bit files.
30+
*/
31+
var FIRST_HEADER_CHUNKSIZE = 16 * 1024, // 16 KB
32+
FIRST_SEQUENCE_CHUNKSIZE = (4 * 1024) - 1, // ~ 4KB
33+
MAX_CHUNKSIZE = 1024 * 1024; // 1 MB
34+
2335
type FileIndexEntry = {
2436
name: string;
2537
offset: number;
@@ -125,6 +137,66 @@ function markUnknownDNA(basePairs: Array<string>, dnaStartIndex: number, sequenc
125137
}
126138

127139

140+
/**
 * An umbrella error type to describe issues with parsing an
 * incomplete chunk of data with JBinary's read. If this is being
 * raised, we either need to ask for more data (a bigger chunk) or
 * report to the user that there might be a problem with the 2bit
 * file, specifically with its header.
 *
 * @param message Optional description of the failure. Any value is
 *     accepted and stored in its string form; a missing/falsy value
 *     becomes the empty string.
 */
function IncompleteChunkError(message) {
  this.name = "IncompleteChunkError";
  // Always store a string so that `message` behaves like it does on built-in errors.
  this.message = String(message || "");
}
// Inherit from Error through a *fresh* prototype object. Re-using
// Error.prototype itself would make every plain Error pass
// `instanceof IncompleteChunkError`.
IncompleteChunkError.prototype = Object.create(Error.prototype);
IncompleteChunkError.prototype.constructor = IncompleteChunkError;
152+
153+
/**
 * Wraps a parsing attempt, captures errors related to
 * incomplete data and re-throws a specialized error:
 * IncompleteChunkError. Otherwise, whatever other error
 * is being raised gets escalated.
 *
 * @param parseFunc Zero-argument function that performs the parse.
 * @returns Whatever `parseFunc` returns.
 * @throws IncompleteChunkError when `parseFunc` fails with an error named
 *     "RangeError" or "INDEX_SIZE_ERR"; any other thrown value is
 *     re-thrown unchanged.
 */
function parseWithException(parseFunc: Function) {
  try {
    return parseFunc();
  } catch (error) {
    // Guard the property access: a thrown null/undefined has no `name`.
    var errorName = error ? error.name : undefined;
    // Chrome-like browsers: RangeError; phantomjs: DOMException
    if (errorName === "RangeError" || errorName === "INDEX_SIZE_ERR") {
      // Forward the textual description rather than the error object itself.
      throw new IncompleteChunkError(error.message || error);
    }
    throw error;
  }
}
172+
173+
/**
 * Try getting a bigger chunk of the remote file
 * until the Incomplete Chunk Error is resolved. This is useful when we need to
 * parse the header, but when we don't know the size of the header up front.
 * If the initial request returns an incomplete header and hence the
 * parsing fails, we next try doubling the requested size.
 * The size of the request is capped with `untilSize` so that
 * we don't start asking for MBs of data for no use.
 * Instead we throw an error if we reach the cap,
 * potentially meaning a corrupted 2bit file.
 *
 * @param remoteFile File to read from; only `getBytes(start, size)` is used.
 * @param start Offset of the first byte to request.
 * @param size Number of bytes to request on this attempt.
 * @param untilSize Upper bound for the size of any retry request.
 * @param promisedFunc Callback applied to the fetched bytes. Throwing an
 *     error named "IncompleteChunkError" from it triggers a retry.
 * @returns A promise for the result of `promisedFunc`.
 * @throws Error (as a rejection) when parsing still fails with
 *     `untilSize` bytes; any other failure is passed through unchanged.
 */
function retryRemoteGet(remoteFile: RemoteFile, start: number, size: number, untilSize: number, promisedFunc: Function) {
  return remoteFile.getBytes(start, size).then(promisedFunc).catch(error => {
    // Only incomplete-chunk failures are worth retrying.
    if (!error || error.name !== "IncompleteChunkError") {
      throw error;
    }
    // Do not attempt to download more than `untilSize`
    if (size >= untilSize) {
      throw new Error(
        `Couldn't parse the header ` +
        `from the first ${size} bytes of the file. ` +
        `Corrupted 2bit file?`);
    }
    // Double the request, but clamp it so the cap is never exceeded.
    var nextSize = Math.min(size * 2, untilSize);
    return retryRemoteGet(remoteFile, start, nextSize, untilSize, promisedFunc);
  });
}
199+
128200
class TwoBit {
129201
remoteFile: RemoteFile;
130202
header: Q.Promise<TwoBitHeader>;
@@ -133,10 +205,15 @@ class TwoBit {
133205
this.remoteFile = remoteFile;
134206
var deferredHeader = Q.defer();
135207
this.header = deferredHeader.promise;
136-
137-
// TODO: if 16k is insufficient, fetch the right amount.
138-
this.remoteFile.getBytes(0, 16*1024).then(function(buffer) {
139-
var header = parseHeader(buffer);
208+
retryRemoteGet(
209+
this.remoteFile,
210+
0, // Beginning of the file
211+
FIRST_HEADER_CHUNKSIZE,
212+
MAX_CHUNKSIZE,
213+
buffer => {
214+
var header = parseWithException(() => {
215+
return parseHeader(buffer);
216+
});
140217
deferredHeader.resolve(header);
141218
}).done();
142219
}
@@ -178,9 +255,17 @@ class TwoBit {
178255
}
179256
var seq = maybeSeq; // for flow, see facebook/flow#266
180257

181-
// TODO: if 4k is insufficient, fetch the right amount.
182-
return this.remoteFile.getBytes(seq.offset, 4095).then(
183-
buf => parseSequenceRecord(buf, seq.offset));
258+
return retryRemoteGet(
259+
this.remoteFile,
260+
seq.offset,
261+
FIRST_SEQUENCE_CHUNKSIZE,
262+
MAX_CHUNKSIZE,
263+
buffer => {
264+
return parseWithException(() => {
265+
return parseSequenceRecord(buffer, seq.offset);
266+
});
267+
}
268+
);
184269
});
185270
}
186271
}

src/main/sources/TwoBitDataSource.js

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,6 @@ var createFromTwoBitFile = function(remoteSource: TwoBit): TwoBitSource {
7171
return Q.when(); // empty promise
7272
}
7373

74-
console.log(`Fetching ${span} base pairs`);
7574
remoteSource.getFeaturesInRange(range.contig, range.start(), range.stop())
7675
.then(letters => {
7776
if (!letters) return;

src/test/data/TwoBit-test.js

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,5 +46,13 @@ describe('TwoBit', function() {
4646
});
4747
});
4848

49+
it('should parse huge headers', function() {
50+
var twoBit = new TwoBit(new RemoteFile('/test-data/susScr3-head.2bit'));
51+
// shouldn't throw an exception
52+
return twoBit.header.then(header => {
53+
expect(header.sequenceCount).to.equal(4583);
54+
});
55+
});
56+
4957
// TODO: masked regions
5058
});

test-data/susScr3-head.2bit

488 KB
Binary file not shown.

0 commit comments

Comments
 (0)