Skip to content

Commit 34ecd6a

Browse files
committed
fix: various bug fixes for filesystem mode
1 parent 5e89fb8 commit 34ecd6a

File tree

5 files changed

+36
-24
lines changed

5 files changed

+36
-24
lines changed

README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ scrape(options).then((result) => {});
5858
* [filenameGenerator](#filenamegenerator) - generate filename for downloaded resource
5959
* [requestConcurrency](#requestconcurrency) - set maximum concurrent requests
6060
* [tempMode](#tempMode) - How to store data temporarily during processing
61+
* [tempDir](#tempMode) - The directory to use to store temp files when `tempMode === 'filesystem'`
6162
* [plugins](#plugins) - plugins, allow to customize filenames, request options, response handling, saving to storage, etc.
6263

6364
Default options you can find in [lib/config/defaults.js](https://github.com/website-scraper/node-website-scraper/blob/master/lib/config/defaults.js) or get them using
@@ -206,7 +207,7 @@ How to store temporary data when processing
206207

207208
* `memory` - Data is stored in memory in its raw format (default).
208209
* `memory-compressed` - Data is stored in memory but compressed using zlib. This is more memory efficient at the expense of CPU time spent compressing and decompressing.
209-
* `fs` / `filesystem` - Data is stored in temporary files on the filesystem. This is the most memory efficient but it is strongly recommended to only use this mode with a solid state drive.
210+
* `filesystem` - Data is stored in temporary files on the filesystem. This is the most memory efficient but it is strongly recommended to only use this mode with a solid state drive.
210211

211212
#### plugins
212213

lib/config/defaults.js

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,8 @@ const config = {
6464
maxRecursiveDepth: null,
6565
maxDepth: null,
6666
ignoreErrors: false,
67-
tempMode: 'memory' // 'memory-compressed', 'fs'
67+
tempMode: 'memory', // 'memory-compressed', 'filesystem'
68+
tempDir: undefined
6869
};
6970

7071
export default config;

lib/resource.js

Lines changed: 13 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -4,16 +4,17 @@ import { promisify } from 'util';
44
import crypto from 'crypto';
55
import fs from 'fs';
66
import path from 'path';
7-
import os from 'os';
87

98
const inflate = promisify(zlib.inflate);
109
const defalate = promisify(zlib.deflate);
1110

1211
class Resource {
13-
constructor (url, filename, tmpMode, tmpDir) {
14-
this.tmpMode = tmpMode || 'memory';
15-
if (tmpMode === 'fs' || tmpMode === 'filesystem') {
16-
this.tmpDir = tmpDir || fs.mkdtempSync(path.join(os.tmpdir(), 'website-scraper-'));
12+
constructor (url, filename, tempMode, tempDir) {
13+
this.tempMode = tempMode || 'memory';
14+
this.tempDir = tempDir;
15+
16+
if (this.tempMode === 'filesystem' && !this.tempDir) {
17+
throw new Error('tmpDir must be provided in tmpMode=filesystem');
1718
}
1819

1920
this.setUrl(url);
@@ -30,7 +31,7 @@ class Resource {
3031
}
3132

3233
createChild (url, filename) {
33-
const child = new Resource(url, filename, this.tmpMode, this.tmpDir);
34+
const child = new Resource(url, filename, this.tempMode, this.tempDir);
3435
let currentDepth = this.getDepth();
3536

3637
child.parent = this;
@@ -53,10 +54,10 @@ class Resource {
5354
}
5455

5556
setUrl (url) {
56-
if (this.tmpDir) {
57+
if (this.tempDir) {
5758
// Generate a unique filename based on the md5 hash of the url
5859
const tmpName = `${crypto.createHash('md5').update(url).digest('hex')}.txt`;
59-
this.tmpPath = path.join(this.tmpDir, tmpName);
60+
this.tmpPath = path.join(this.tempDir, tmpName);
6061
}
6162

6263
this.url = url;
@@ -71,30 +72,28 @@ class Resource {
7172
}
7273

7374
async getText () {
74-
switch (this.tmpMode) {
75+
switch (this.tempMode) {
7576
case 'memory':
7677
return await this._memoryRead();
7778
case 'memory-compressed':
7879
return await this._memoryReadCompressed();
79-
case 'fs':
8080
case 'filesystem':
8181
return await this._fsRead();
8282
default:
83-
throw new Error('Unknown tmpMode');
83+
throw new Error(`Unknown tempMode: ${this.tempMode}`);
8484
}
8585
}
8686

8787
async setText (text) {
88-
switch (this.tmpMode) {
88+
switch (this.tempMode) {
8989
case 'memory':
9090
return await this._memoryWrite(text);
9191
case 'memory-compressed':
9292
return await this._memoryWriteCompressed(text);
93-
case 'fs':
9493
case 'filesystem':
9594
return await this._fsWrite(text);
9695
default:
97-
throw new Error('Unknown tmpMode');
96+
throw new Error(`Unknown tempMode: ${this.tempMode}`);
9897
}
9998
}
10099

lib/scraper.js

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,9 @@ import {
1111
GenerateFilenameByTypePlugin,
1212
GetResourceReferencePlugin
1313
} from './plugins/index.js';
14+
import fs from 'fs';
15+
import path from 'path';
16+
import os from 'os';
1417

1518
import * as utils from './utils/index.js';
1619
const { extend, union, urlsEqual, getTypeByMime, getTypeByFilename, series } = utils;
@@ -47,7 +50,16 @@ class Scraper {
4750
requestResource: this.requestResource.bind(this),
4851
getReference: this.runActions.bind(this, 'getReference')
4952
});
50-
this.resources = this.options.urls.map(({url, filename}) => new Resource(url, filename, this.options.tmpMode));
53+
54+
logger.info('tmpMode', this.options.tempMode);
55+
if (this.options.tempMode === 'filesystem') {
56+
if (!this.options.tempDir) {
57+
this.options.tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'website-scraper-'));
58+
}
59+
logger.info('tmpDir', this.options.tempDir);
60+
}
61+
62+
this.resources = this.options.urls.map(({url, filename}) => new Resource(url, filename, this.options.tempMode, this.options.tempDir));
5163

5264
this.requestedResourcePromises = new NormalizedUrlMap(); // Map url -> request promise
5365
this.loadedResources = new NormalizedUrlMap(); // Map url -> resource

test/unit/resource-test.js

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@ import 'should';
22
import Resource from '../../lib/resource.js';
33
import fs from 'fs/promises';
44
import '../utils/assertions.js';
5+
import path from 'path';
6+
import os from 'os';
57

68
describe('Resource', () => {
79
describe('#createChild', () => {
@@ -61,14 +63,11 @@ describe('Resource', () => {
6163
(await resource.getText()).should.eql(testString2);
6264
});
6365

64-
it('fs mode', async () => {
65-
const resource = new Resource('http://example.com', 'index.html', 'fs');
66-
resource.setEncoding('utf8');
67-
68-
resource.tmpDir.should.not.eql(undefined);
66+
it('filesystem mode', async () => {
67+
const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'website-scraper-'));
6968

7069
try {
71-
const resource = new Resource('http://example.com', 'index.html', 'fs');
70+
const resource = new Resource('http://example.com', 'index.html', 'filesystem', tmpDir);
7271
resource.setEncoding('utf8');
7372

7473
await resource.setText(testString1);
@@ -77,7 +76,7 @@ describe('Resource', () => {
7776
await resource.setText(testString2);
7877
(await resource.getText()).should.eql(testString2);
7978
} finally {
80-
await fs.rm(resource.tmpDir, { recursive: true, force: true });
79+
await fs.rm(tmpDir, { recursive: true, force: true });
8180
}
8281
});
8382
});

0 commit comments

Comments
 (0)