2
2
/*jshint node: true */
3
3
/*jshint esversion: 6 */
4
4
/*jshint latedef: false */
5
+ /* jshint expr: true */
5
6
"use strict" ;
6
7
const request = require ( 'request' ) ;
7
8
const fs = require ( 'fs' ) ;
@@ -11,158 +12,144 @@ const utilities = require('./utilities');
11
12
const debug = require ( 'debug' ) ( 'spider' ) ;
12
13
debug . enabled = false ;
13
14
// Module-level state shared by the functions in this file.

// Flipped to true by download(); spiderLinks() passes it on to its callback.
let downloaded = false;

// URLs already handed to spider(), so that each URL is processed at most once.
const spidering = new Map();

// Errors received by the top-level spider() callback.
const errors = [];

// Root URL and limit on simultaneously running spider() calls;
// both are assigned from process.argv by the start-up code.
let url;
let concurrency;
/**
 * Writes `body` to `filename`, creating the parent directory first.
 *
 * @param {string} filename - Path of the file to write.
 * @param {string|Buffer} body - Content to store.
 * @param {function(?Error, string=, boolean=)} callback - Called with the
 *   error from mkdirp or fs.writeFile, or with `(null, filename, true)`
 *   once the file has been written.
 */
function saveFile(filename, body, callback) {
  mkdirp(path.dirname(filename), (mkdirErr) => {
    if (mkdirErr) {
      return callback(mkdirErr);
    }
    fs.writeFile(filename, body, (writeErr) => {
      if (writeErr) {
        return callback(writeErr);
      }
      return callback(null, filename, true);
    });
  });
}
/**
 * Fetches `url` and stores the response body in `filename`.
 *
 * The module-level `downloaded` flag is set, and the success message is
 * logged, only after the body has actually been saved; a failed request or
 * a failed save reports the error and leaves the flag untouched.
 *
 * @param {string} url - Address to fetch.
 * @param {string} filename - Destination path passed to saveFile().
 * @param {function(?Error, string=)} callback - Called with the error, or
 *   with `(null, body)` once the body has been saved.
 */
function download(url, filename, callback) {
  console.log(`Downloading ${url} to ${filename}`);
  request(url, (requestErr, response, body) => {
    if (requestErr) {
      return callback(requestErr);
    }
    saveFile(filename, body, (saveErr) => {
      if (saveErr) {
        return callback(saveErr);
      }
      console.log(`Downloaded and saved ${url} to ${filename}`);
      downloaded = true;
      callback(null, body);
    });
  });
}
/**
 * Processes one URL: reuses the locally stored copy when there is one,
 * otherwise downloads the page, then follows its links via spiderLinks().
 *
 * A URL that was already passed to spider() is not processed again; in that
 * case the callback is invoked on the next tick without arguments.
 *
 * @param {string} url - Page to process.
 * @param {number} nesting - Remaining link depth, forwarded to spiderLinks().
 * @param {function(?Error, string=, boolean=)} callback - Receives
 *   `(err, filename, false)` when reading or downloading fails; otherwise it
 *   is handed on to spiderLinks().
 */
function spider(url, nesting, callback) {
  if (spidering.has(url)) {
    return process.nextTick(callback);
  }
  spidering.set(url, true);

  const filename = utilities.urlToFilename(url);
  fs.readFile(filename, 'utf8', (readErr, cachedBody) => {
    if (!readErr) {
      // A local copy exists: follow its links without downloading.
      return spiderLinks(url, cachedBody, nesting, callback);
    }
    if (readErr.code !== 'ENOENT') {
      return callback(readErr, filename, false);
    }
    // No local copy yet: download it first.
    return download(url, filename, (downloadErr, freshBody) => {
      if (downloadErr) {
        return callback(downloadErr, filename, false);
      }
      spiderLinks(url, freshBody, nesting, callback);
    });
  });
}
/**
 * Spiders every link found in `body`, running at most `concurrency`
 * (module-level) spider() calls at the same time.
 *
 * The callback is invoked exactly once: with the first error reported by a
 * link, or with `(null, currentUrl, downloaded)` after all links completed.
 * Once a link has failed, no further links are started.
 *
 * @param {string} currentUrl - URL of the page whose links are followed.
 * @param {string} body - Page content passed to utilities.getPageLinks().
 * @param {number} nesting - Remaining depth; 0 means links are not followed.
 * @param {function(?Error, string=, boolean=)} callback - Completion callback.
 */
function spiderLinks(currentUrl, body, nesting, callback) {
  if (nesting === 0) {
    return process.nextTick(callback, null, currentUrl, downloaded);
  }

  const links = utilities.getPageLinks(currentUrl, body);
  if (links.length === 0) {
    return process.nextTick(callback, null, currentUrl, downloaded);
  }

  let completed = 0;
  let running = 0;
  let index = 0;
  let finished = false;

  // Reports the outcome once; later calls are ignored so that several
  // failing links cannot trigger the callback repeatedly.
  function finish(err) {
    if (finished) {
      return;
    }
    finished = true;
    if (err) {
      return callback(err);
    }
    return callback(null, currentUrl, downloaded);
  }

  // Completion handler for a single link.
  function spidered(err) {
    if (finished) {
      return;
    }
    if (err) {
      return finish(err);
    }
    completed++;
    running--;
    if (completed === links.length) {
      return finish();
    }
    next();
  }

  // Starts links until the concurrency limit is reached or none are left.
  function next() {
    while (!finished && running < concurrency && index < links.length) {
      const link = links[index++];
      // Count the task before starting it so a synchronously completing
      // spider() cannot unbalance the counter.
      running++;
      spider(link, nesting - 1, spidered);
    }
  }

  next();
}
/**
 * Prints the command-line usage text to stderr and terminates the process
 * with exit code 1.
 */
function exitMessage() {
  console.error('Usage: node spider.js url {level} {concurrency}.\nLevel defaults to 1.\nConcurrency defaults to 2.');
  process.exit(1);
}
/**
 * Parses an optional positive-integer command-line argument.
 *
 * @param {string|undefined} raw - Raw argument text; a missing or empty
 *   value selects `fallback`.
 * @param {number} fallback - Value used when the argument is absent.
 * @returns {number} The parsed value or `fallback`. When the text is not a
 *   positive integer, exitMessage() is called, which ends the process.
 */
function parsePositiveArg(raw, fallback) {
  if (!raw) {
    return fallback;
  }
  const value = parseInt(raw, 10);
  if (Number.isNaN(value) || value <= 0) {
    exitMessage();
  }
  return value;
}

// Start-up: node spider.js url {level} {concurrency}
url = process.argv[2];
const level = parsePositiveArg(process.argv[3], 1);
// The concurrency value itself is validated here; a non-numeric value must
// not slip through as NaN, because then no link would ever be scheduled.
concurrency = parsePositiveArg(process.argv[4], 2);

if (url) {
  spider(url, level, (err, filename, downloaded) => {
    if (err) {
      console.log(err);
      errors.push(err);
    } else if (downloaded) {
      console.log(`Completed the download of "${url}"`);
    } else {
      console.log(`"${url}" has already been downloaded`);
    }

    if (errors.length) {
      console.log("Check errors. Redownload if necessary.");
      errors.forEach((error) => {
        console.log(`${error.name}:${error.message}`);
      });
    }
  });
} else {
  exitMessage();
}
0 commit comments