@@ -14,7 +14,7 @@ var Resource = require('./resource');
1414
1515var getFilenameGenerator = require ( './filename-generators/filename-generator-getter' ) ;
1616var makeRequest = require ( './request' ) ;
17- var compareUrls = require ( 'compare-urls ' ) ;
17+ var normalizeUrl = require ( 'normalize-url ' ) ;
1818
1919var loadHtml = require ( './file-handlers/html' ) ;
2020var loadCss = require ( './file-handlers/css' ) ;
@@ -26,24 +26,49 @@ function loadHtmlAndCss (context, po) {
2626
2727function Scraper ( options ) {
2828 this . originalResources = [ ] ;
29+ this . occupiedFileNames = [ ] ;
2930 this . loadedResources = [ ] ;
31+ this . respondedResourcePromises = { } ;
32+ this . loadedResourcePromises = { } ;
3033
3134 this . options = _ . extend ( { } , defaults , options ) ;
3235 this . options . request = _ . extend ( { } , defaults . request , options . request ) ;
3336 this . options . directory = path . resolve ( process . cwd ( ) , this . options . directory || '' ) ;
3437 this . options . filenameGenerator = getFilenameGenerator ( this . options . filenameGenerator ) ;
3538}
3639
37- Scraper . prototype . getLoadedResource = function getLoadedResource ( resource ) {
38- return _ . find ( this . loadedResources , function checkUrlsEqual ( lr ) {
39- return compareUrls ( resource . getUrl ( ) , lr . getUrl ( ) ) ;
40- } ) ;
40+ Scraper . prototype . addOccupiedFileName = function addOccupiedFileName ( filename ) {
41+ this . occupiedFileNames . push ( filename ) ;
42+ } ;
43+
44+ Scraper . prototype . getOccupiedFileNames = function getOccupiedFileNames ( ) {
45+ return this . occupiedFileNames ;
4146} ;
4247
4348Scraper . prototype . addLoadedResource = function addLoadedResource ( resource ) {
4449 this . loadedResources . push ( resource ) ;
4550} ;
4651
52+ Scraper . prototype . addRespondedResourcePromise = function addRespondedResourcePromise ( url , promise ) {
53+ url = normalizeUrl ( url ) ;
54+ this . respondedResourcePromises [ url ] = promise ;
55+ } ;
56+
57+ Scraper . prototype . getRespondedResourcePromise = function getRespondedResourcePromise ( url ) {
58+ url = normalizeUrl ( url ) ;
59+ return this . respondedResourcePromises [ url ] ;
60+ } ;
61+
62+ Scraper . prototype . addLoadedResourcePromise = function addLoadedResourcePromise ( url , promise ) {
63+ url = normalizeUrl ( url ) ;
64+ this . loadedResourcePromises [ url ] = promise ;
65+ } ;
66+
67+ Scraper . prototype . getLoadedResourcePromise = function getLoadedResourcePromise ( url ) {
68+ url = normalizeUrl ( url ) ;
69+ return this . loadedResourcePromises [ url ] ;
70+ } ;
71+
4772Scraper . prototype . getHtmlSources = function getHtmlSources ( ) {
4873 return this . options . sources ;
4974} ;
@@ -53,46 +78,81 @@ Scraper.prototype.getResourceHandler = function getHandler (resource) {
5378 var depth = resource . getDepth ( ) ;
5479 var depthGreaterThanMax = self . options . maxDepth && depth >= self . options . maxDepth ;
5580
56- switch ( true ) {
57- case depthGreaterThanMax : return _ . noop ;
58- case resource . isCss ( ) : return loadCss ;
59- case resource . isHtml ( ) : return loadHtmlAndCss ;
60- default : return _ . noop ;
81+ switch ( true ) {
82+ case depthGreaterThanMax :
83+ return _ . noop ;
84+ case resource . isCss ( ) :
85+ return loadCss ;
86+ case resource . isHtml ( ) :
87+ return loadHtmlAndCss ;
88+ default :
89+ return _ . noop ;
6190 }
6291} ;
6392
6493Scraper . prototype . loadResource = function loadResource ( resource ) {
6594 var self = this ;
95+ var url = resource . getUrl ( ) ;
6696
67- if ( ! self . options . urlFilter ( resource . url ) ) {
97+ if ( ! self . options . urlFilter ( url ) ) {
6898 return Promise . resolve ( null ) ;
6999 }
70100
71- // try to find already loaded
72- var loaded = self . getLoadedResource ( resource ) ;
101+ var loadedResourcePromise = self . getLoadedResourcePromise ( url ) ;
102+ if ( loadedResourcePromise ) {
103+ return loadedResourcePromise ;
104+ }
73105
74- if ( ! loaded ) {
75- var url = resource . getUrl ( ) ;
76- var filename = self . options . filenameGenerator ( resource , self . options , self . loadedResources ) ;
77- resource . setFilename ( filename ) ;
106+ var respondedResourcePromise = self . requestResource ( resource ) ;
107+
108+ loadedResourcePromise = respondedResourcePromise . then ( function handleResponse ( resource ) {
109+ return Promise . resolve ( )
110+ . then ( function handleResource ( ) {
111+ var resourceHandler = self . getResourceHandler ( resource ) ;
112+ return resourceHandler ( self , resource ) ;
113+ } ) . then ( function fileHandled ( ) {
114+ var filename = path . join ( self . options . directory , resource . getFilename ( ) ) ;
115+ var text = resource . getText ( ) ;
116+ return outputFileAsync ( filename , text , { encoding : 'binary' } ) ;
117+ } ) . then ( function fileSaved ( ) {
118+ self . addLoadedResourcePromise ( resource . getUrl ( ) , loadedResourcePromise ) ;
119+ self . addLoadedResource ( resource ) ;
120+ return resource ;
121+ } ) ;
122+ } ) ;
123+ self . addLoadedResourcePromise ( url , loadedResourcePromise ) ;
124+
125+ return loadedResourcePromise ;
126+ } ;
127+
128+ Scraper . prototype . requestResource = function requestResource ( resource ) {
129+ var self = this ;
130+ var url = resource . getUrl ( ) ;
131+
132+ if ( ! self . options . urlFilter ( url ) ) {
133+ return Promise . resolve ( null ) ;
134+ }
78135
79- self . addLoadedResource ( resource ) ;
80-
81- // Request -> processing -> save to fs
82- return self . makeRequest ( url ) . then ( function requestCompleted ( data ) {
83- resource . setUrl ( data . url ) ; // Url may be changed in redirects
84- resource . setText ( data . body ) ;
85- var handleFile = self . getResourceHandler ( resource ) ;
86- return handleFile ( self , resource ) ;
87- } ) . then ( function fileHandled ( ) {
88- var filename = path . join ( self . options . directory , resource . getFilename ( ) ) ;
89- var text = resource . getText ( ) ;
90- return outputFileAsync ( filename , text , { encoding : 'binary' } ) ;
91- } ) . then ( function fileSaved ( ) {
92- return Promise . resolve ( resource ) ;
93- } ) ;
136+ var respondedResourcePromise = self . getRespondedResourcePromise ( url ) ;
137+ if ( respondedResourcePromise ) {
138+ return respondedResourcePromise ;
94139 }
95- return Promise . resolve ( loaded ) ;
140+
141+ var promise = self . makeRequest ( url ) . then ( function requestCompleted ( data ) {
142+ // Url may be changed in redirects
143+ resource . setUrl ( data . url ) ;
144+
145+ var filename = self . options . filenameGenerator ( resource , self . options , self . getOccupiedFileNames ( ) ) ;
146+ resource . setFilename ( filename ) ;
147+ self . addOccupiedFileName ( filename ) ;
148+ self . addRespondedResourcePromise ( data . url , promise ) ;
149+
150+ resource . setText ( data . body ) ;
151+ return resource ;
152+ } ) ;
153+ self . addRespondedResourcePromise ( url , promise ) ;
154+
155+ return promise ;
96156} ;
97157
98158Scraper . prototype . validate = function validate ( ) {
@@ -118,7 +178,7 @@ Scraper.prototype.prepare = function prepare () {
118178 return new Resource ( url , filename ) ;
119179 } ) ;
120180
121- if ( self . options . recursive ) {
181+ if ( self . options . recursive ) {
122182 self . options . sources = _ . union ( self . options . sources , recursiveSources ) ;
123183 }
124184
@@ -127,9 +187,28 @@ Scraper.prototype.prepare = function prepare () {
127187
128188Scraper . prototype . load = function load ( ) {
129189 var self = this ;
130- return Promise . map ( self . originalResources , function loadResource ( resource ) {
131- return self . loadResource ( resource ) ;
190+ _ . forEach ( self . originalResources , function loadResource ( resource ) {
191+ self . loadResource ( resource ) ;
132192 } ) ;
193+ return self . waitForLoad ( 0 ) ;
194+ } ;
195+
196+ // Returns a promise which gets resolved when all resources are loaded.
197+ // Determines whether loading is done by:
198+ // 1. Waiting until all loadedResourcePromises are resolved.
199+ // 2. Recursing itself if any new loadedResourcePromises (promises for the loading of childResources) where added during this time. If not, loading is done.
200+ Scraper . prototype . waitForLoad = function waitForLoad ( previousCount ) {
201+ var self = this ;
202+ var count = _ . size ( self . loadedResourcePromises ) ;
203+ if ( count === previousCount ) {
204+ return Promise . all ( _ . map ( self . originalResources , function getLoadedResource ( resource ) {
205+ return self . getLoadedResourcePromise ( resource . getUrl ( ) ) ;
206+ } ) ) ;
207+ } else {
208+ return Promise
209+ . all ( _ . toArray ( self . loadedResourcePromises ) )
210+ . then ( self . waitForLoad . bind ( self , count ) ) ;
211+ }
133212} ;
134213
135214Scraper . prototype . errorCleanup = function errorCleanup ( error ) {
0 commit comments