3434# -v $(repo-root-dir.sh):/app \
3535# ${DEV_ENV_DOCKER_IMAGE} \
3636# /bin/bash
37- # root@fb740b14ab66:/app#
37+ # root@fb740b14ab66:/app# int-test-run-all-spiders-in-ci-pipeline.py \
38+ # --max-num-spiders-to-run 1 \
39+ # --max-num-seconds-spiders-run 30 \
40+ # --log=info \
41+ # /app/dave \
42+ # simonsdave/gaming-spiders:bindle
3843#
3944
4045import datetime
4146import json
47+ import logging
48+ import optparse
4249import os
50+ import re
4351import sys
4452import subprocess
4553import time
4654
4755import dateutil .parser
4856
57+ import cloudfeaster
58+
59+ _logger = logging .getLogger (__name__ )
60+
61+
62+ def _check_logging_level (option , opt , value ):
63+ """Type checking function for command line parser's 'logginglevel' type."""
64+ reg_ex_pattern = "^(DEBUG|INFO|WARNING|ERROR|CRITICAL)$"
65+ reg_ex = re .compile (reg_ex_pattern , re .IGNORECASE )
66+ if reg_ex .match (value ):
67+ return getattr (logging , value .upper ())
68+ fmt = (
69+ "option %s: should be one of "
70+ "DEBUG, INFO, WARNING, ERROR or CRITICAL"
71+ )
72+ raise optparse .OptionValueError (fmt % opt )
73+
74+
class CommandLineOption(optparse.Option):
    """Adds new option types to the command line parser's base option types."""

    # option types this subclass contributes beyond optparse.Option.TYPES
    new_types = ('logginglevel',)

    TYPES = optparse.Option.TYPES + new_types
    # merge the base checkers with ours in a single expression rather
    # than copy-then-assign
    TYPE_CHECKER = dict(
        optparse.Option.TYPE_CHECKER,
        logginglevel=_check_logging_level)
83+
84+
class CommandLineParser(optparse.OptionParser):
    """Command line parser for this script.

    Understands the custom 'logginglevel' option type (via
    CommandLineOption) and requires exactly two positional arguments:
    the output directory and the docker image.
    """

    def __init__(self):
        optparse.OptionParser.__init__(
            self,
            'usage: %prog [options] <output-dir> <docker-image>',
            description='discover spiders',
            version='%%prog %s' % cloudfeaster.__version__,
            option_class=CommandLineOption)

        # how many spiders may run concurrently
        self.add_option(
            '--max-num-spiders-to-run',
            action='store',
            type='int',
            dest='max_number_spiders_to_run',
            default=1,
            help='# spiders to run @ same time - default = 1')

        # wall-clock budget per spider before it is killed
        self.add_option(
            '--max-num-seconds-spiders-run',
            action='store',
            type='int',
            dest='max_seconds_spiders_to_run',
            default=60,
            help='max # seconds to run spider - default = 60')

        # 'logginglevel' type converts the name to a numeric level
        self.add_option(
            '--log',
            action='store',
            dest='logging_level',
            default=logging.ERROR,
            type='logginglevel',
            help='logging level [DEBUG,INFO,WARNING,ERROR,CRITICAL] - default = ERROR')

    def parse_args(self, *args, **kwargs):
        """Parse the command line, enforcing the two required positional args."""
        (options, positional) = optparse.OptionParser.parse_args(self, *args, **kwargs)
        if 2 != len(positional):
            self.error('output dir & docker image are required')
        return (options, positional)
138+
49139
50140class SpidersContainer (object ):
51141
@@ -65,21 +155,22 @@ def spiders(self):
65155 all_the_metadata = json .loads (subprocess .check_output (args ).decode ('UTF-8' ).strip ())
66156 del all_the_metadata ['_metadata' ]
67157
68- filenames = set ()
158+ filenames_by_spider_name = {}
69159
70160 for (category , spiders ) in all_the_metadata .items ():
71161 for (name , metadata ) in spiders .items ():
72- filenames . add ( metadata ['absoluteFilename' ])
162+ filenames_by_spider_name [ name ] = metadata ['absoluteFilename' ]
73163
74- return list ( filenames )
164+ return filenames_by_spider_name
75165
76166
77167class CrawlContainer (object ):
78168
79- def __init__ (self , spider , docker_image ):
169+ def __init__ (self , spider , absolute_filename , docker_image ):
80170 object .__init__ (self )
81171
82172 self .spider = spider
173+ self .absolute_filename = absolute_filename
83174 self .docker_image = docker_image
84175
85176 self .container_id = None
@@ -93,7 +184,7 @@ def start(self):
93184 'run' ,
94185 '-d' ,
95186 self .docker_image ,
96- self .spider ,
187+ self .absolute_filename ,
97188 ]
98189 self .container_id = subprocess .check_output (args ).decode ('UTF-8' ).strip ()
99190
@@ -157,7 +248,10 @@ def number_seconds_running(self):
157248 return (now - start_date ).total_seconds ()
158249
159250 def save_output (self , output_dir , output = None ):
160- spider_output_dir = os .path .join (output_dir , os .path .splitext (self .spider )[0 ])
251+ spider_output_dir = os .path .join (
252+ output_dir ,
253+ self .spider )
254+ # os.makedirs() will throw an exception if there's an error
161255 os .makedirs (spider_output_dir )
162256
163257 if not output :
@@ -192,44 +286,70 @@ def _copy_debug_file(self, output, debug_file_property, spider_output_dir, debug
192286
193287
194288if __name__ == "__main__" :
195- if len (sys .argv ) != 5 :
196- fmt = "usage: {app} <#-spiders-2-run-at-same-time> <max-secs-for-spider-to-run> <output-dir> <docker-image>"
197- print (fmt .format (app = os .path .split (sys .argv [0 ])[1 ]))
198- sys .exit (1 )
199-
200- max_number_spiders_to_run = int (sys .argv [1 ])
201- max_seconds_spiders_to_run = int (sys .argv [2 ])
202- output_dir = sys .argv [3 ]
203- docker_image = sys .argv [4 ]
204-
205- spiders_left_to_run = SpidersContainer (docker_image ).spiders ()
289+ #
290+ # parse command line
291+ #
292+ clp = CommandLineParser ()
293+ (clo , cla ) = clp .parse_args ()
294+
295+ (output_dir , docker_image ) = cla
296+
297+ #
298+ # configure logging ... remember gmt = utc
299+ #
300+ logging .Formatter .converter = time .gmtime
301+ logging .basicConfig (
302+ level = clo .logging_level ,
303+ datefmt = '%Y-%m-%d %H:%M:%S' ,
304+ format = '%(asctime)s %(levelname)s %(module)s:%(lineno)d %(message)s' )
305+
306+ #
307+ # Useful for debugging
308+ #
309+ _logger .info ('Output >>>{output_dir}<<<' .format (output_dir = output_dir ))
310+ _logger .info ('Docker image >>>{docker_image}<<<' .format (docker_image = docker_image ))
311+ _logger .info ('Max # spider to run >>>{max_num_spiders}<<<' .format (max_num_spiders = clo .max_number_spiders_to_run ))
312+ msg = 'Max # seconds for spider to run >>>{max_num_seconds}<<<' .format (
313+ max_num_seconds = clo .max_seconds_spiders_to_run )
314+ _logger .info (msg )
315+
316+ #
317+ # now the real work begins ...
318+ #
319+ filenames_by_spider_name = SpidersContainer (docker_image ).spiders ()
320+
321+ spiders_left_to_run = list (filenames_by_spider_name .keys ())
206322
207323 running_spiders = []
208324 run_spiders = []
209325
326+ _logger .info ('Spiders to run {spiders}' .format (spiders = spiders_left_to_run ))
327+
210328 while spiders_left_to_run or running_spiders :
211329 # check if any of the running spiders have finished
212330 for running_spider in running_spiders :
213331 if running_spider .is_finished ():
214- print ( '>>>{spider}<<< finished running - {status}' .format (
332+ msg = '>>>{spider}<<< finished running - {status}' .format (
215333 spider = running_spider .spider ,
216- status = 'success' if running_spider .is_success () else 'failure' ))
334+ status = 'success' if running_spider .is_success () else 'failure' )
335+ _logger .info (msg )
217336 running_spider .save_output (output_dir )
218337
219338 running_spiders .remove (running_spider )
220339 run_spiders .append (running_spider )
221340 else :
222341 number_seconds_running = running_spider .number_seconds_running ()
223342
224- if max_seconds_spiders_to_run < number_seconds_running :
343+ if clo . max_seconds_spiders_to_run < number_seconds_running :
225344 msg_fmt = (
226345 '>>>{spider}<<< ran for {seconds:.0f} '
227346 'seconds which is too long (> {max} seconds) - killing spider'
228347 )
229- print ( msg_fmt .format (
348+ msg = msg_fmt .format (
230349 spider = running_spider .spider ,
231350 seconds = number_seconds_running ,
232- max = max_seconds_spiders_to_run ))
351+ max = clo .max_seconds_spiders_to_run )
352+ _logger .info (msg )
233353
234354 running_spider .kill ()
235355 running_spider .save_output (
@@ -239,18 +359,19 @@ def _copy_debug_file(self, output, debug_file_property, spider_output_dir, debug
239359 running_spiders .remove (running_spider )
240360 run_spiders .append (running_spider )
241361 else :
242- print ( '>>>{spider}<<< still running after {seconds:.0f} seconds' .format (
362+ msg = '>>>{spider}<<< still running after {seconds:.0f} seconds' .format (
243363 spider = running_spider .spider ,
244- seconds = number_seconds_running ))
364+ seconds = number_seconds_running )
365+ _logger .info (msg )
245366
246367 # start spiders left to run until max # of spiders running reached
247368 while spiders_left_to_run :
248- if len (running_spiders ) < max_number_spiders_to_run :
369+ if len (running_spiders ) < clo . max_number_spiders_to_run :
249370 spider = spiders_left_to_run .pop (0 )
250- cc = CrawlContainer (spider , docker_image )
371+ cc = CrawlContainer (spider , filenames_by_spider_name [ spider ], docker_image )
251372 cc .start ()
252373 running_spiders .append (cc )
253- print ('>>>{spider}<<< started running' .format (spider = spider ))
374+ _logger . info ('>>>{spider}<<< started running' .format (spider = spider ))
254375 else :
255376 break
256377
0 commit comments