@@ -12,6 +12,7 @@ var makeRequest = require('./request');
1212var ResourceHandler = require ( './resource-handler' ) ;
1313var FSAdapter = require ( './fs-adaper' ) ;
1414var utils = require ( './utils' ) ;
15+ var NormalizedUrlMap = require ( './utils/normalized-url-map' ) ;
1516
1617function Scraper ( options ) {
1718 var self = this ;
@@ -40,36 +41,19 @@ function Scraper (options) {
4041 return new Resource ( url , filename ) ;
4142 } ) ;
4243
43- self . respondedResourcePromises = { } ; // Map url -> request promise
44- self . loadedResources = { } ; // Map url -> resource
44+ self . requestedResourcePromises = new NormalizedUrlMap ( ) ; // Map url -> request promise
45+ self . loadedResources = new NormalizedUrlMap ( ) ; // Map url -> resource
4546}
4647
47- Scraper . prototype . addRespondedResourcePromise = function addRespondedResourcePromise ( url , promise ) {
48- this . respondedResourcePromises [ utils . normalizeUrl ( url ) ] = promise ;
49- } ;
50-
51- Scraper . prototype . getRespondedResourcePromise = function getRespondedResourcePromise ( url ) {
52- return this . respondedResourcePromises [ utils . normalizeUrl ( url ) ] ;
53- } ;
54-
55- Scraper . prototype . addLoadedResource = function addLoadedResourcePromise ( url , promise ) {
56- this . loadedResources [ utils . normalizeUrl ( url ) ] = promise ;
57- } ;
58-
59- Scraper . prototype . getLoadedResource = function getLoadedResourcePromise ( url ) {
60- return this . loadedResources [ utils . normalizeUrl ( url ) ] ;
61- } ;
62-
6348Scraper . prototype . loadResource = function loadResource ( resource ) {
6449 var self = this ;
6550 var url = resource . getUrl ( ) ;
6651
67- var loadedResource = self . getLoadedResource ( url ) ;
68- if ( loadedResource ) {
52+ if ( self . loadedResources . has ( url ) ) {
6953 logger . debug ( 'found loaded resource for ' + resource ) ;
7054 } else {
71- logger . debug ( 'start loading resource ' + resource ) ;
72- self . addLoadedResource ( url , resource ) ;
55+ logger . debug ( 'add loaded resource ' + resource ) ;
56+ self . loadedResources . set ( url , resource ) ;
7357 }
7458} ;
7559
@@ -89,63 +73,69 @@ Scraper.prototype.saveResource = function saveResource (resource) {
8973 } ) ;
9074} ;
9175
92- Scraper . prototype . requestResource = function requestResource ( resource ) {
76+ Scraper . prototype . createNewRequest = function createNewRequest ( resource ) {
9377 var self = this ;
9478 var url = resource . getUrl ( ) ;
9579
96- if ( ! self . options . urlFilter ( url ) ) {
97- logger . debug ( 'filtering out ' + resource + ' by url filter' ) ;
98- return Promise . resolve ( null ) ;
99- }
100-
101- if ( self . options . maxDepth && resource . getDepth ( ) > self . options . maxDepth ) {
102- logger . debug ( 'filtering out ' + resource + ' by depth' ) ;
103- return Promise . resolve ( null ) ;
104- }
105-
106- var respondedResourcePromise = self . getRespondedResourcePromise ( url ) ;
107- if ( respondedResourcePromise ) {
108- logger . debug ( 'found responded resource for ' + resource ) ;
109- return respondedResourcePromise ;
110- }
111-
112- respondedResourcePromise = Promise . resolve ( )
80+ var requestPromise = Promise . resolve ( )
11381 . then ( function makeRequest ( ) {
11482 var referer = resource . parent ? resource . parent . getUrl ( ) : null ;
11583 return self . makeRequest ( url , referer ) ;
11684 } ) . then ( function requestCompleted ( responseData ) {
11785
11886 if ( ! utils . urlsEqual ( responseData . url , url ) ) { // Url may be changed in redirects
11987 logger . debug ( 'url changed. old url = ' + url + ', new url = ' + responseData . url ) ;
120- var respondedNewUrlResource = self . getRespondedResourcePromise ( responseData . url ) ;
121- if ( respondedNewUrlResource ) {
122- return respondedNewUrlResource ;
88+
89+ if ( self . requestedResourcePromises . has ( responseData . url ) ) {
90+ return self . requestedResourcePromises . get ( responseData . url ) ;
12391 }
92+
12493 resource . setUrl ( responseData . url ) ;
125- self . addRespondedResourcePromise ( responseData . url , respondedResourcePromise ) ;
94+ self . requestedResourcePromises . set ( responseData . url , requestPromise ) ;
12695 }
12796
12897 resource . setType ( utils . getTypeByMime ( responseData . mimeType ) ) ;
12998
13099 var filename = self . filenameGenerator . generateFilename ( resource ) ;
131100 resource . setFilename ( filename ) ;
132101
133- // if type was not determined by mime
134- // we can try to get it from filename after it was generated
102+ // if type was not determined by mime we can try to get it from filename after it was generated
135103 if ( ! resource . getType ( ) ) {
136104 resource . setType ( utils . getTypeByFilename ( filename ) ) ;
137105 }
138106
139107 resource . setText ( responseData . body ) ;
108+ self . loadResource ( resource ) ; // Add resource to list for future downloading, see Scraper.waitForLoad
140109 return resource ;
141110 } ) . catch ( function handleError ( err ) {
142111 logger . warn ( 'failed to request resource ' + resource ) ;
143112 return self . handleError ( err ) ;
144113 } ) ;
145114
146- self . addRespondedResourcePromise ( url , respondedResourcePromise ) ;
115+ self . requestedResourcePromises . set ( url , requestPromise ) ;
116+ return requestPromise ;
117+ } ;
147118
148- return respondedResourcePromise ;
119+ Scraper . prototype . requestResource = function requestResource ( resource ) {
120+ var self = this ;
121+ var url = resource . getUrl ( ) ;
122+
123+ if ( ! self . options . urlFilter ( url ) ) {
124+ logger . debug ( 'filtering out ' + resource + ' by url filter' ) ;
125+ return Promise . resolve ( null ) ;
126+ }
127+
128+ if ( self . options . maxDepth && resource . getDepth ( ) > self . options . maxDepth ) {
129+ logger . debug ( 'filtering out ' + resource + ' by depth' ) ;
130+ return Promise . resolve ( null ) ;
131+ }
132+
133+ if ( self . requestedResourcePromises . has ( url ) ) {
134+ logger . debug ( 'found requested resource for ' + resource ) ;
135+ return self . requestedResourcePromises . get ( url ) ;
136+ }
137+
138+ return self . createNewRequest ( resource ) ;
149139} ;
150140
151141Scraper . prototype . validate = function validate ( ) {
@@ -155,23 +145,16 @@ Scraper.prototype.validate = function validate () {
155145Scraper . prototype . load = function load ( ) {
156146 var self = this ;
157147 return self . fsAdapter . createDirectory ( ) . then ( function loadAllResources ( ) {
158- return Promise . map ( self . originalResources , function loadResource ( originalResource ) {
159- return self . requestResource ( originalResource ) . then ( function receivedResponse ( resOriginalResource ) {
160- if ( resOriginalResource ) {
161- // Do not wait for loadResource here, to prevent deadlock, scraper.waitForLoad
162- self . loadResource ( resOriginalResource ) ;
163- }
164- } ) ;
165- } ) . then ( self . waitForLoad . bind ( self ) ) ;
166- } ) ;
148+ return Promise . map ( self . originalResources , self . requestResource . bind ( self ) ) ;
149+ } ) . then ( self . waitForLoad . bind ( self ) ) ;
167150} ;
168151
169152// Returns a promise which gets resolved when all resources are loaded.
170153// 1. Get all unsaved resources and save them
171154// 2. Recurse if any new unsaved resources were added in the meantime. If not, loading is done.
172155Scraper . prototype . waitForLoad = function waitForLoad ( ) {
173156 var self = this ;
174- var resourcesToSave = _ . filter ( self . loadedResources , ( r ) => ! r . isSaved ( ) ) ;
157+ var resourcesToSave = Array . from ( self . loadedResources . values ( ) ) . filter ( ( r ) => ! r . isSaved ( ) ) ;
175158 var loadingIsFinished = _ . isEmpty ( resourcesToSave ) ;
176159
177160 if ( ! loadingIsFinished ) {
0 commit comments