@@ -40,7 +40,7 @@ function Scraper (options) {
4040
4141 self . occupiedFileNames = [ ] ; // Array of unavailable filenames
4242 self . respondedResourcePromises = { } ; // Map url -> request promise
43- self . loadedResourcePromises = { } ; // Map url -> save-to-fs promise
43+ self . loadedResources = { } ; // Map url -> resource
4444}
4545
4646Scraper . prototype . addOccupiedFileName = function addOccupiedFileName ( filename ) {
@@ -59,12 +59,12 @@ Scraper.prototype.getRespondedResourcePromise = function getRespondedResourcePro
5959 return this . respondedResourcePromises [ utils . normalizeUrl ( url ) ] ;
6060} ;
6161
// Remembers `resource` as loaded, keyed by the normalized form of `url`.
// The function expression is named after the method so that stack traces
// and Function.name match the property it is assigned to.
//   url      - url of the resource; passed through utils.normalizeUrl before use as key
//   resource - value stored in the loadedResources map (url -> resource)
Scraper.prototype.addLoadedResource = function addLoadedResource (url, resource) {
	this.loadedResources[utils.normalizeUrl(url)] = resource;
};
6565
// Returns whatever is stored in the loadedResources map under the
// normalized form of `url` (see utils.normalizeUrl).
// The function expression is named after the method so that stack traces
// and Function.name match the property it is assigned to.
Scraper.prototype.getLoadedResource = function getLoadedResource (url) {
	return this.loadedResources[utils.normalizeUrl(url)];
};
6969
7070Scraper . prototype . getHtmlSources = function getHtmlSources ( ) {
@@ -75,34 +75,38 @@ Scraper.prototype.loadResource = function loadResource (resource) {
7575 var self = this ;
7676 var url = resource . getUrl ( ) ;
7777
78- var loadedResourcePromise = self . getLoadedResourcePromise ( url ) ;
79- if ( loadedResourcePromise ) {
78+ var loadedResource = self . getLoadedResource ( url ) ;
79+ if ( loadedResource ) {
8080 logger . debug ( 'found loaded resource for ' + resource ) ;
8181 } else {
8282 logger . debug ( 'start loading resource ' + resource ) ;
83- loadedResourcePromise = Promise . resolve ( )
84- . then ( function handleResource ( ) {
85- var resourceHandler = self . getResourceHandler ( resource ) ;
86- return resourceHandler ( self , resource ) ;
87- } ) . then ( function fileHandled ( ) {
88- return self . fsAdapter . saveResource ( resource ) ;
89- } ) . catch ( function handleError ( err ) {
90- logger . warn ( 'failed to save resource ' + resource ) ;
91- return self . handleError ( err ) ;
92- } ) ;
93-
94- self . addLoadedResourcePromise ( url , loadedResourcePromise ) ;
83+ self . addLoadedResource ( url , resource ) ;
9584 }
85+ } ;
9686
97- return loadedResourcePromise ;
// Runs the resource handler for `resource` and then writes it through fsAdapter.
// The resource is flagged as saved synchronously, before any async work starts.
// Any error from the handler or the fs adapter is logged as a warning and
// passed to this.handleError, whose result becomes the promise's result.
Scraper.prototype.saveResource = function saveResource (resource) {
	var scraper = this;

	// Step 1: let the handler chosen for this resource process it.
	function runHandler () {
		var handle = scraper.getResourceHandler(resource);
		return handle(scraper, resource);
	}

	// Step 2: persist the handled resource.
	function writeToFs () {
		logger.info('saving resource ' + resource + ' to fs');
		return scraper.fsAdapter.saveResource(resource);
	}

	// Failure in either step ends up here.
	function onFailure (err) {
		logger.warn('failed to save resource ' + resource);
		return scraper.handleError(err);
	}

	resource.setSaved();

	return Promise.resolve()
		.then(runHandler)
		.then(writeToFs)
		.catch(onFailure);
};
99103
100104Scraper . prototype . requestResource = function requestResource ( resource ) {
101105 var self = this ;
102106 var url = resource . getUrl ( ) ;
103107
104108 if ( ! self . options . urlFilter ( url ) ) {
105- logger . info ( 'filtering out ' + resource + ' by url filter' ) ;
109+ logger . debug ( 'filtering out ' + resource + ' by url filter' ) ;
106110 return Promise . resolve ( null ) ;
107111 }
108112
@@ -121,6 +125,10 @@ Scraper.prototype.requestResource = function requestResource (resource) {
121125
122126 if ( ! utils . urlsEqual ( responseData . url , url ) ) { // Url may be changed in redirects
123127 logger . debug ( 'url changed. old url = ' + url + ', new ulr = ' + responseData . url ) ;
128+ var respondedNewUrlResource = self . getRespondedResourcePromise ( responseData . url ) ;
129+ if ( respondedNewUrlResource ) {
130+ return respondedNewUrlResource ;
131+ }
124132 resource . setUrl ( responseData . url ) ;
125133 self . addRespondedResourcePromise ( responseData . url , respondedResourcePromise ) ;
126134 }
@@ -155,23 +163,21 @@ Scraper.prototype.load = function load () {
155163 // Do not wait for loadResource here, to prevent deadlock, scraper.waitForLoad
156164 self . loadResource ( resOriginalResource ) ;
157165 } ) ;
158- } ) . then ( function afterAllOriginalResourcesResponded ( ) {
159- return self . waitForLoad ( 0 ) ;
160- } ) ;
166+ } ) . then ( self . waitForLoad . bind ( self ) ) ;
161167 } ) ;
162168} ;
163169
// Returns a promise which gets resolved with originalResources once every
// loaded resource has been saved.
// 1. Collect the loaded resources that are not saved yet and save them one by one.
// 2. Check again afterwards: if new unsaved resources were registered in the
//    meantime, repeat; otherwise loading is done.
Scraper.prototype.waitForLoad = function waitForLoad () {
	var scraper = this;

	function isUnsaved (res) {
		return !res.isSaved();
	}

	var pending = _.filter(scraper.loadedResources, isUnsaved);

	// Nothing left to save - loading is finished.
	if (_.isEmpty(pending)) {
		return Promise.resolve(scraper.originalResources);
	}

	var saveOne = scraper.saveResource.bind(scraper);
	var checkAgain = scraper.waitForLoad.bind(scraper);

	return Promise.mapSeries(pending, saveOne).then(checkAgain);
};
0 commit comments