Skip to content

Commit 5afb782

Browse files
authored
Add updateSources option (#319)
* replaceLinks option added * fix problem with not using self instead of this * oops, forgot to return. * fix tests * Rename new option to updateSources and adjust description * Fix recursive test for node 10 * Fix tests for old node
1 parent e3427cb commit 5afb782

File tree

6 files changed

+125
-6
lines changed

6 files changed

+125
-6
lines changed

README.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ scrape(options, (error, result) => {
6464
* [onResourceError](#onresourceerror) - callback called when a resource fails to download
6565
* [updateMissingSources](#updatemissingsources) - update url for missing sources with absolute url
6666
* [requestConcurrency](#requestconcurrency) - set maximum concurrent requests
67+
* [updateSources](#updatesources) - set to false to keep all html content unmodified
6768

6869
You can find the default options in [lib/config/defaults.js](https://github.com/website-scraper/node-website-scraper/blob/master/lib/config/defaults.js) or get them using `scrape.defaults`.
6970

@@ -296,6 +297,12 @@ scrape({
296297
Number, maximum amount of concurrent requests. Defaults to `Infinity`.
297298

298299

300+
#### updateSources
301+
Boolean. Defaults to `true`. Use `false` when the scraped site structure does not
302+
fit your custom filename generator, or if you do not want HTML content to be
303+
modified in any way.
304+
305+
299306
## callback
300307
Callback function, optional, includes the following parameters:
301308
- `error`: if error - `Error` object, if success - `null`

lib/config/defaults.js

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,8 @@ const config = {
5959
onResourceSaved: null,
6060
onResourceError: null,
6161
resourceSaver: null,
62-
updateMissingSources: false
62+
updateMissingSources: false,
63+
updateSources: true,
6364
};
6465

6566
module.exports = config;

lib/resource-handler/index.js

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ const utils = require('../utils');
88
const HtmlHandler = require('./html');
99
const CssHandler = require('./css');
1010

11-
const supportedOptions = ['prettifyUrls', 'sources', 'recursiveSources', 'maxRecursiveDepth', 'defaultFilename', 'updateMissingSources'];
11+
const supportedOptions = ['prettifyUrls', 'sources', 'recursiveSources', 'maxRecursiveDepth', 'defaultFilename', 'updateMissingSources', 'updateSources'];
1212

1313
class ResourceHandler {
1414
constructor (options, context) {
@@ -83,12 +83,15 @@ class ResourceHandler {
8383
});
8484

8585
return utils.waitAllFulfilled(childrenPromises).then(function updateChildrenPaths () {
86+
if (self.options.updateSources === false) {
87+
return pathContainer.updateText([]);
88+
}
8689
return pathContainer.updateText(pathsToUpdate);
8790
});
8891
}
8992

9093
updateChildrenResources (pathContainer, parentResource, needToUpdate) {
91-
if (!needToUpdate) {
94+
if (!needToUpdate || this.options.updateSources === false) {
9295
return Promise.resolve(pathContainer.updateText([]));
9396
}
9497
const parentUrl = parentResource.getUrl();
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
<!DOCTYPE html>
2+
<html lang="en">
3+
<head>
4+
<meta charset="UTF-8">
5+
<title>Data</title>
6+
</head>
7+
<body>
8+
<a href="http://example.com/about.html">About 1</a>
9+
<a href="//example.com/about.html">About 2</a>
10+
<a href="//about.html">About 3</a>
11+
<a href="../about.html">About 4</a>
12+
</body>
13+
</html>

test/functional/recursive/mocks/index.html

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,12 @@
55
<title>Title</title>
66
</head>
77
<body>
8-
<a href="/about.html"></a>
8+
<a href="/about.html">About</a>
9+
<a href="http://example.com/data/data.html">Data 1</a>
10+
<a href="//example.com/data/data.html">Data 2</a>
11+
<a href="//data/data.html">Data 3</a>
12+
<a href="/data/data.html">Data 4</a>
13+
<a href="/data/data/data.html">Data 5</a>
914

1015
</body>
1116
</html>

test/functional/recursive/recursive.test.js

Lines changed: 92 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,9 @@ var nock = require('nock');
33
var fs = require('fs-extra');
44
var scrape = require('../../../index');
55

6-
var testDirname = __dirname + '/.tmp';
76
var mockDirname = __dirname + '/mocks';
7+
var testDirname = __dirname + '/.tmp';
8+
var URL = require('url');
89

910
describe('Functional recursive downloading', function() {
1011

@@ -25,7 +26,8 @@ describe('Functional recursive downloading', function() {
2526
directory: testDirname,
2627
subdirectories: null,
2728
sources: [],
28-
recursive: true
29+
recursive: true,
30+
updateSources: true,
2931
};
3032

3133
nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html');
@@ -35,6 +37,7 @@ describe('Functional recursive downloading', function() {
3537
nock('http://example.com/').get('/link1.html').reply(200, 'content 1');
3638
nock('http://example.com/').get('/link2.html').reply(200, 'content 2');
3739
nock('http://example.com/').get('/link3.html').reply(200, 'content 3');
40+
nock('http://example.com/').get('/data/data.html').replyWithFile(200, mockDirname + '/data.html');
3841

3942
return scrape(options).then(function() {
4043
fs.existsSync(testDirname + '/index.html').should.be.eql(true);
@@ -49,6 +52,93 @@ describe('Functional recursive downloading', function() {
4952
});
5053
});
5154

55+
it('should follow anchors if recursive flag is set and links not replaced', function () {
56+
var options = {
57+
urls: [ 'http://example.com/' ],
58+
directory: testDirname,
59+
subdirectories: null,
60+
sources: [],
61+
recursive: true,
62+
updateSources: false,
63+
};
64+
65+
nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html');
66+
67+
// mock for anchors
68+
nock('http://example.com/').get('/about.html').replyWithFile(200, mockDirname + '/about.html');
69+
nock('http://example.com/').get('/link1.html').reply(200, 'content 1');
70+
nock('http://example.com/').get('/link2.html').reply(200, 'content 2');
71+
nock('http://example.com/').get('/link3.html').reply(200, 'content 3');
72+
nock('http://example.com/').get('/data/data.html').replyWithFile(200, mockDirname + '/data.html');
73+
74+
return scrape(options).then(function() {
75+
fs.existsSync(testDirname + '/index.html').should.be.eql(true);
76+
77+
fs.readFileSync(testDirname + '/data.html').toString().should.eql(
78+
fs.readFileSync(mockDirname + '/data.html').toString());
79+
80+
fs.readFileSync(testDirname + '/index.html').toString().should.eql(
81+
fs.readFileSync(mockDirname + '/index.html').toString());
82+
83+
// index.html anchors loaded
84+
fs.existsSync(testDirname + '/about.html').should.be.eql(true);
85+
86+
// about.html anchors loaded
87+
fs.existsSync(testDirname + '/link1.html').should.be.eql(true);
88+
fs.existsSync(testDirname + '/link2.html').should.be.eql(true);
89+
fs.existsSync(testDirname + '/link3.html').should.be.eql(true);
90+
});
91+
});
92+
93+
it('should follow anchors if recursive flag is set and custom filename generator follows exact site structure',
94+
function () {
95+
var generateFilename = function (url) {
96+
var parsedUrl = URL.parse(url);
97+
if (parsedUrl.pathname === '/') {
98+
return parsedUrl.hostname + parsedUrl.pathname + "/index.html";
99+
}
100+
return parsedUrl.hostname + parsedUrl.pathname;
101+
};
102+
var options = {
103+
urls: [ 'http://example.com/' ],
104+
directory: testDirname,
105+
subdirectories: null,
106+
sources: [],
107+
recursive: true,
108+
updateSources: false,
109+
filenameGenerator: (resource, options, occupiedFileNames) => {
110+
return generateFilename(resource.url);
111+
}
112+
};
113+
114+
nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html');
115+
116+
// mock for anchors
117+
nock('http://example.com/').get('/about.html').replyWithFile(200, mockDirname + '/about.html');
118+
nock('http://example.com/').get('/link1.html').reply(200, 'content 1');
119+
nock('http://example.com/').get('/link2.html').reply(200, 'content 2');
120+
nock('http://example.com/').get('/link3.html').reply(200, 'content 3');
121+
nock('http://example.com/').get('/data/data.html').replyWithFile(200, mockDirname + '/data.html');
122+
123+
return scrape(options).then(function() {
124+
fs.existsSync(testDirname + '/example.com/index.html').should.be.eql(true);
125+
126+
fs.readFileSync(testDirname + '/example.com/data/data.html').toString().should.eql(
127+
fs.readFileSync(mockDirname + '/data.html').toString());
128+
129+
fs.readFileSync(testDirname + '/example.com/index.html').toString().should.eql(
130+
fs.readFileSync(mockDirname + '/index.html').toString());
131+
132+
// index.html anchors loaded
133+
fs.existsSync(testDirname + '/example.com/about.html').should.be.eql(true);
134+
135+
// about.html anchors loaded
136+
fs.existsSync(testDirname + '/example.com/link1.html').should.be.eql(true);
137+
fs.existsSync(testDirname + '/example.com/link2.html').should.be.eql(true);
138+
fs.existsSync(testDirname + '/example.com/link3.html').should.be.eql(true);
139+
});
140+
});
141+
52142
it('should follow anchors with depth <= maxDepth if recursive flag and maxDepth are set', function () {
53143
var options = {
54144
urls: [ 'http://example.com/' ],

0 commit comments

Comments
 (0)