Skip to content

Commit 96d4729

Browse files
committed
Tighten up the file sniffer findMimeType() implementation and improve test coverage
1 parent f86760a commit 96d4729

File tree

2 files changed

+12
-31
lines changed

2 files changed

+12
-31
lines changed

file/sniffer.js

Lines changed: 11 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -7,18 +7,8 @@
77
* Copyright(c) 2020 Google Inc.
88
*/
99

10-
// A Container Format is a file that embeds multiple data streams into a single file.
11-
// Examples:
12-
// 1) the ISO-BMFF family (MP4, HEVC, AVIF, MOV/QT, etc)
13-
// 2) the Matroska family (MKV, WebM)
14-
// 3) the RIFF family (WAV, AVI, WebP)
15-
// 4) the OGG family (OGV, OPUS)
16-
// 5) the ZIP family (ZIP, JAR, CBZ, EPUB, ODF, OOXML)
17-
18-
// The ISO-BMFF container needs special processing because of its "compatible brands" array :(
19-
// The Matroska container needs special processing because the sub-type can appear anywhere :(
20-
// The OGG container needs special processing to determine what kind of streams are present :(
21-
// The ZIP container needs special processing to determine what files are present inside it :(
10+
// https://mimesniff.spec.whatwg.org/ is a good resource.
11+
// https://github.com/h2non/filetype is an easy target for reverse-engineering.
2212

2313
// NOTE: Because the ICO format also starts with a couple zero bytes, this tree will rely on the
2414
// File Type box never going beyond 255 bytes in length, which seems unlikely according to
@@ -72,13 +62,9 @@ const fileSignatures = {
7262
'font/woff2': [[0x77, 0x4F, 0x46, 0x32]], // 'wOF2'
7363
};
7464

75-
// TODO: Eventually add support for various container formats so that:
76-
// * an OGG container can be resolved to OGG Audio, OGG Video
77-
// * an HEIF container can be resolved to AVIF, HEIC
78-
7965
/**
80-
* Represents a single byte in the tree. If this node terminates a known MIME type (see magic
81-
* numbers above), then the mimeType field will be set.
66+
* Represents a single byte in the magic number tree. If this node terminates a known MIME type
67+
* (see magic numbers above), then the mimeType field will be set.
8268
*/
8369
class Node {
8470
/** @type {string} */
@@ -133,9 +119,9 @@ export function initialize() {
133119
}
134120

135121
if (curNode.mimeType) {
136-
throw `File signature collision: ${curNode.mimeType} overlaps with ${mimeType}`;
122+
throw `Magic number collision: ${curNode.mimeType} overlaps with ${mimeType}`;
137123
} else if (Object.keys(curNode.children).length > 0) {
138-
throw `${mimeType} signature is not unique, it collides with other mime types`;
124+
throw `${mimeType} magic number is not unique, it collides with other mime types`;
139125
}
140126
curNode.mimeType = mimeType;
141127
} // for each signature
@@ -152,23 +138,17 @@ export function findMimeType(ab) {
152138
initialize();
153139
}
154140

155-
const depth = ab.byteLength < maxDepth ? ab.byteLength : maxDepth;
156-
const arr = new Uint8Array(ab).subarray(0, depth);
141+
const arr = new Uint8Array(ab);
157142
let curNode = root;
158143
let mimeType;
159144
// Step through bytes, updating curNode as it walks down the byte tree.
160145
for (const byte of arr) {
161-
// If this node has a placeholder child, just step into it.
162-
if (curNode.children['??']) {
163-
curNode = curNode.children['??'];
164-
continue;
165-
}
166-
if (curNode.children[byte] === undefined) return undefined;
167-
curNode = curNode.children[byte];
168-
if (curNode.mimeType) {
169-
mimeType = curNode.mimeType;
146+
// If we found the mimeType or it is unknown, break the loop.
147+
if (!curNode || (mimeType = curNode.mimeType)) {
170148
break;
171149
}
150+
// Move into the next byte's node (if it exists) or the placeholder node (if it exists).
151+
curNode = curNode.children[byte] || curNode.children['??'];
172152
}
173153
return mimeType;
174154
}

tests/file-sniffer.spec.js

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ function sniffTest(mimeType, sig) {
1717
}
1818

1919
describe('bitjs.file.sniffer', () => {
20+
it('handles unknown', () => { sniffTest(undefined, [0x01, 0x99, 0xde, 0xad])});
2021
it('BMP', () => { sniffTest('image/bmp', [0x42, 0x4D, 0x46]); });
2122
it('GIF', () => { sniffTest('image/gif', [0x47, 0x49, 0x46, 0x38, 0x20]); });
2223
it('JPG', () => { sniffTest('image/jpeg', [0xFF, 0xD8, 0xFF, 0x23]); });

0 commit comments

Comments
 (0)