Add updateSources option (#319)

s0ph1e · web-flow · commit 5afb782e2f43 · 2018-10-31T23:20:26.000+02:00
* replaceLinks option added

* fix problem with not using self instead of this

* oops, forgot to return.

* fix tests

* Rename new option to updateSources and adjust description

* Fix recursive test for node 10

* Fix tests for old node
diff --git a/README.md b/README.md
@@ -64,6 +64,7 @@ scrape(options, (error, result) => {
 * [onResourceError](#onresourceerror) - callback called when resource's downloading is failed
 * [updateMissingSources](#updatemissingsources) - update url for missing sources with absolute url
 * [requestConcurrency](#requestconcurrency) - set maximum concurrent requests
+* [updateSources](#updatesources) - set to false to keep all html content unmodified
  
 Default options you can find in [lib/config/defaults.js](https://github.com/website-scraper/node-website-scraper/blob/master/lib/config/defaults.js) or get them using `scrape.defaults`.
 
@@ -296,6 +297,12 @@ scrape({
 Number, maximum amount of concurrent requests. Defaults to `Infinity`.
 
 
+#### updateSources
+Boolean. Defaults to `true`. Set to `false` if the scraped site's structure does
+not fit your custom filename generator, or if you do not want the HTML content
+to be modified in any way.
+
+
 ## callback 
 Callback function, optional, includes following parameters:
   - `error`: if error - `Error` object, if success - `null`
diff --git a/lib/config/defaults.js b/lib/config/defaults.js
@@ -59,7 +59,8 @@ const config = {
 	onResourceSaved: null,
 	onResourceError: null,
 	resourceSaver: null,
-	updateMissingSources: false
+	updateMissingSources: false,
+	updateSources: true,
 };
 
 module.exports = config;
diff --git a/lib/resource-handler/index.js b/lib/resource-handler/index.js
@@ -8,7 +8,7 @@ const utils = require('../utils');
 const HtmlHandler = require('./html');
 const CssHandler = require('./css');
 
-const supportedOptions = ['prettifyUrls', 'sources', 'recursiveSources', 'maxRecursiveDepth', 'defaultFilename', 'updateMissingSources'];
+const supportedOptions = ['prettifyUrls', 'sources', 'recursiveSources', 'maxRecursiveDepth', 'defaultFilename', 'updateMissingSources', 'updateSources'];
 
 class ResourceHandler {
 	constructor (options, context) {
@@ -83,12 +83,15 @@ class ResourceHandler {
 		});
 
 		return utils.waitAllFulfilled(childrenPromises).then(function updateChildrenPaths () {
+			if (self.options.updateSources === false) {
+				return pathContainer.updateText([]);
+			}
 			return pathContainer.updateText(pathsToUpdate);
 		});
 	}
 
 	updateChildrenResources (pathContainer, parentResource, needToUpdate) {
-		if (!needToUpdate) {
+		if (!needToUpdate || this.options.updateSources === false) {
 			return Promise.resolve(pathContainer.updateText([]));
 		}
 		const parentUrl = parentResource.getUrl();
diff --git a/test/functional/recursive/mocks/data.html b/test/functional/recursive/mocks/data.html
@@ -0,0 +1,13 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <title>Data</title>
+</head>
+<body>
+        <a href="http://example.com/about.html">About 1</a>
+        <a href="//example.com/about.html">About 2</a>
+        <a href="//about.html">About 3</a>
+        <a href="../about.html">About 4</a>
+</body>
+</html>
diff --git a/test/functional/recursive/mocks/index.html b/test/functional/recursive/mocks/index.html
@@ -5,7 +5,12 @@
     <title>Title</title>
 </head>
 <body>
-<a href="/about.html"></a>
+<a href="/about.html">About</a>
+<a href="http://example.com/data/data.html">Data 1</a>
+<a href="//example.com/data/data.html">Data 2</a>
+<a href="//data/data.html">Data 3</a>
+<a href="/data/data.html">Data 4</a>
+<a href="/data/data/data.html">Data 5</a>
 
 </body>
 </html>
diff --git a/test/functional/recursive/recursive.test.js b/test/functional/recursive/recursive.test.js
@@ -3,8 +3,9 @@ var nock = require('nock');
 var fs = require('fs-extra');
 var scrape = require('../../../index');
 
-var testDirname = __dirname + '/.tmp';
 var mockDirname = __dirname + '/mocks';
+var testDirname = __dirname + '/.tmp';
+var URL = require('url');
 
 describe('Functional recursive downloading', function() {
 
@@ -25,7 +26,8 @@ describe('Functional recursive downloading', function() {
 			directory: testDirname,
 			subdirectories: null,
 			sources: [],
-			recursive: true
+			recursive: true,
+			updateSources: true,
 		};
 
 		nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html');
@@ -35,6 +37,7 @@ describe('Functional recursive downloading', function() {
 		nock('http://example.com/').get('/link1.html').reply(200, 'content 1');
 		nock('http://example.com/').get('/link2.html').reply(200, 'content 2');
 		nock('http://example.com/').get('/link3.html').reply(200, 'content 3');
+		nock('http://example.com/').get('/data/data.html').replyWithFile(200, mockDirname + '/data.html');
 
 		return scrape(options).then(function() {
 			fs.existsSync(testDirname + '/index.html').should.be.eql(true);
@@ -49,6 +52,93 @@ describe('Functional recursive downloading', function() {
 		});
 	});
 
+	it('should follow anchors if recursive flag is set and links not replaced', function () {
+		var options = {
+			urls: [ 'http://example.com/' ],
+			directory: testDirname,
+			subdirectories: null,
+			sources: [],
+			recursive: true,
+			updateSources: false,
+		};
+
+		nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html');
+
+		// mock for anchors
+		nock('http://example.com/').get('/about.html').replyWithFile(200, mockDirname + '/about.html');
+		nock('http://example.com/').get('/link1.html').reply(200, 'content 1');
+		nock('http://example.com/').get('/link2.html').reply(200, 'content 2');
+		nock('http://example.com/').get('/link3.html').reply(200, 'content 3');
+		nock('http://example.com/').get('/data/data.html').replyWithFile(200, mockDirname + '/data.html');
+
+		return scrape(options).then(function() {
+			fs.existsSync(testDirname + '/index.html').should.be.eql(true);
+
+			fs.readFileSync(testDirname + '/data.html').toString().should.eql(
+				fs.readFileSync(mockDirname + '/data.html').toString());
+
+			fs.readFileSync(testDirname + '/index.html').toString().should.eql(
+				fs.readFileSync(mockDirname + '/index.html').toString());
+
+			// index.html anchors loaded
+			fs.existsSync(testDirname + '/about.html').should.be.eql(true);
+
+			// about.html anchors loaded
+			fs.existsSync(testDirname + '/link1.html').should.be.eql(true);
+			fs.existsSync(testDirname + '/link2.html').should.be.eql(true);
+			fs.existsSync(testDirname + '/link3.html').should.be.eql(true);
+		});
+	});
+
+	it('should follow anchors if recursive flag is set and custom filename generator follows exact site structure',
+	    function () {
+		var generateFilename = function (url) {
+			var parsedUrl = URL.parse(url);
+			if (parsedUrl.pathname === '/') {
+				return parsedUrl.hostname + parsedUrl.pathname + "/index.html";
+			}
+			return parsedUrl.hostname + parsedUrl.pathname;
+		};
+		var options = {
+			urls: [ 'http://example.com/' ],
+			directory: testDirname,
+			subdirectories: null,
+			sources: [],
+			recursive: true,
+			updateSources: false,
+			filenameGenerator: (resource, options, occupiedFileNames) => {
+				return generateFilename(resource.url);
+			}
+		};
+
+		nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html');
+
+		// mock for anchors
+		nock('http://example.com/').get('/about.html').replyWithFile(200, mockDirname + '/about.html');
+		nock('http://example.com/').get('/link1.html').reply(200, 'content 1');
+		nock('http://example.com/').get('/link2.html').reply(200, 'content 2');
+		nock('http://example.com/').get('/link3.html').reply(200, 'content 3');
+		nock('http://example.com/').get('/data/data.html').replyWithFile(200, mockDirname + '/data.html');
+
+		return scrape(options).then(function() {
+			fs.existsSync(testDirname + '/example.com/index.html').should.be.eql(true);
+
+			fs.readFileSync(testDirname + '/example.com/data/data.html').toString().should.eql(
+				fs.readFileSync(mockDirname + '/data.html').toString());
+
+			fs.readFileSync(testDirname + '/example.com/index.html').toString().should.eql(
+				fs.readFileSync(mockDirname + '/index.html').toString());
+
+			// index.html anchors loaded
+			fs.existsSync(testDirname + '/example.com/about.html').should.be.eql(true);
+
+			// about.html anchors loaded
+			fs.existsSync(testDirname + '/example.com/link1.html').should.be.eql(true);
+			fs.existsSync(testDirname + '/example.com/link2.html').should.be.eql(true);
+			fs.existsSync(testDirname + '/example.com/link3.html').should.be.eql(true);
+		});
+	});
+
 	it('should follow anchors with depth <= maxDepth if recursive flag and maxDepth are set', function () {
 		var options = {
 			urls: [ 'http://example.com/' ],