Skip to content

Commit 8c54414

Browse files
committed
fix: silly bug in int-test-run-all-spiders-in-ci-pipeline.py
1 parent 4a11e03 commit 8c54414

File tree

2 files changed

+152
-29
lines changed

2 files changed

+152
-29
lines changed

CHANGELOG.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,9 @@ This project adheres to [Semantic Versioning](http://semver.org/).
1212

1313
### Changed
1414

15-
* Nothing
15+
* fixed a silly bug in ```int-test-run-all-spiders-in-ci-pipeline.py``` that
16+
made the command unusable; also added real Python logging and real
17+
command line option handling for this command
1618

1719
### Removed
1820

bin/int-test-run-all-spiders-in-ci-pipeline.py

Lines changed: 149 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -34,18 +34,108 @@
3434
# -v $(repo-root-dir.sh):/app \
3535
# ${DEV_ENV_DOCKER_IMAGE} \
3636
# /bin/bash
37-
# root@fb740b14ab66:/app#
37+
# root@fb740b14ab66:/app# int-test-run-all-spiders-in-ci-pipeline.py \
38+
# --max-num-spiders-to-run 1 \
39+
# --max-num-seconds-spiders-run 30 \
40+
# --log=info \
41+
# /app/dave \
42+
# simonsdave/gaming-spiders:bindle
3843
#
3944

4045
import datetime
4146
import json
47+
import logging
48+
import optparse
4249
import os
50+
import re
4351
import sys
4452
import subprocess
4553
import time
4654

4755
import dateutil.parser
4856

57+
import cloudfeaster
58+
59+
_logger = logging.getLogger(__name__)
60+
61+
62+
def _check_logging_level(option, opt, value):
63+
"""Type checking function for command line parser's 'logginglevel' type."""
64+
reg_ex_pattern = "^(DEBUG|INFO|WARNING|ERROR|CRITICAL)$"
65+
reg_ex = re.compile(reg_ex_pattern, re.IGNORECASE)
66+
if reg_ex.match(value):
67+
return getattr(logging, value.upper())
68+
fmt = (
69+
"option %s: should be one of "
70+
"DEBUG, INFO, WARNING, ERROR or CRITICAL"
71+
)
72+
raise optparse.OptionValueError(fmt % opt)
73+
74+
75+
class CommandLineOption(optparse.Option):
    """optparse.Option subclass that understands a 'logginglevel' type.

    The extra type is validated/converted by ``_check_logging_level``.
    """

    # extra option types understood by this option class
    new_types = (
        'logginglevel',
    )

    TYPES = optparse.Option.TYPES + new_types
    TYPE_CHECKER = dict(
        optparse.Option.TYPE_CHECKER,
        logginglevel=_check_logging_level)
83+
84+
85+
class CommandLineParser(optparse.OptionParser):
    """Command line parser for this script.

    Positional arguments: <output-dir> <docker-image>. Optional flags
    control how many spiders run concurrently, how long each spider may
    run, and the logging level.
    """

    def __init__(self):

        optparse.OptionParser.__init__(
            self,
            'usage: %prog [options] <output-dir> <docker-image>',
            # was 'discover spiders' - looked like a copy/paste from the
            # spider-discovery script; this command runs spiders
            description='run all spiders',
            version='%%prog %s' % cloudfeaster.__version__,
            option_class=CommandLineOption)

        # NOTE: locals renamed help -> help_text so the builtin help()
        # isn't shadowed
        default = 1
        help_text = '# spiders to run @ same time - default = {default}'.format(
            default=default)
        self.add_option(
            '--max-num-spiders-to-run',
            action='store',
            type='int',
            dest='max_number_spiders_to_run',
            default=default,
            help=help_text)

        default = 60
        help_text = 'max # seconds to run spider - default = {default}'.format(
            default=default)
        self.add_option(
            '--max-num-seconds-spiders-run',
            action='store',
            type='int',
            dest='max_seconds_spiders_to_run',
            default=default,
            help=help_text)

        default = logging.ERROR
        help_text = (
            "logging level [DEBUG,INFO,WARNING,ERROR,CRITICAL] - "
            "default = %s"
        ) % logging.getLevelName(default)
        self.add_option(
            "--log",
            action="store",
            dest="logging_level",
            default=default,
            # 'logginglevel' is the custom type added by CommandLineOption
            type="logginglevel",
            help=help_text)

    def parse_args(self, *args, **kwargs):
        """Parse the command line.

        Exits via ``self.error()`` unless exactly two positional
        arguments (output dir & docker image) are supplied; otherwise
        returns ``(options, positional_args)``.
        """
        (clo, cla) = optparse.OptionParser.parse_args(self, *args, **kwargs)
        if len(cla) != 2:
            self.error('output dir & docker image are required')

        return (clo, cla)
138+
49139

50140
class SpidersContainer(object):
51141

@@ -65,21 +155,22 @@ def spiders(self):
65155
all_the_metadata = json.loads(subprocess.check_output(args).decode('UTF-8').strip())
66156
del all_the_metadata['_metadata']
67157

68-
filenames = set()
158+
filenames_by_spider_name = {}
69159

70160
for (category, spiders) in all_the_metadata.items():
71161
for (name, metadata) in spiders.items():
72-
filenames.add(metadata['absoluteFilename'])
162+
filenames_by_spider_name[name] = metadata['absoluteFilename']
73163

74-
return list(filenames)
164+
return filenames_by_spider_name
75165

76166

77167
class CrawlContainer(object):
78168

79-
def __init__(self, spider, docker_image):
169+
def __init__(self, spider, absolute_filename, docker_image):
80170
object.__init__(self)
81171

82172
self.spider = spider
173+
self.absolute_filename = absolute_filename
83174
self.docker_image = docker_image
84175

85176
self.container_id = None
@@ -93,7 +184,7 @@ def start(self):
93184
'run',
94185
'-d',
95186
self.docker_image,
96-
self.spider,
187+
self.absolute_filename,
97188
]
98189
self.container_id = subprocess.check_output(args).decode('UTF-8').strip()
99190

@@ -157,7 +248,10 @@ def number_seconds_running(self):
157248
return (now - start_date).total_seconds()
158249

159250
def save_output(self, output_dir, output=None):
160-
spider_output_dir = os.path.join(output_dir, os.path.splitext(self.spider)[0])
251+
spider_output_dir = os.path.join(
252+
output_dir,
253+
self.spider)
254+
# os.makedirs() will throw an exception if there's an error
161255
os.makedirs(spider_output_dir)
162256

163257
if not output:
@@ -192,44 +286,70 @@ def _copy_debug_file(self, output, debug_file_property, spider_output_dir, debug
192286

193287

194288
if __name__ == "__main__":
195-
if len(sys.argv) != 5:
196-
fmt = "usage: {app} <#-spiders-2-run-at-same-time> <max-secs-for-spider-to-run> <output-dir> <docker-image>"
197-
print(fmt.format(app=os.path.split(sys.argv[0])[1]))
198-
sys.exit(1)
199-
200-
max_number_spiders_to_run = int(sys.argv[1])
201-
max_seconds_spiders_to_run = int(sys.argv[2])
202-
output_dir = sys.argv[3]
203-
docker_image = sys.argv[4]
204-
205-
spiders_left_to_run = SpidersContainer(docker_image).spiders()
289+
#
290+
# parse command line
291+
#
292+
clp = CommandLineParser()
293+
(clo, cla) = clp.parse_args()
294+
295+
(output_dir, docker_image) = cla
296+
297+
#
298+
# configure logging ... remember gmt = utc
299+
#
300+
logging.Formatter.converter = time.gmtime
301+
logging.basicConfig(
302+
level=clo.logging_level,
303+
datefmt='%Y-%m-%d %H:%M:%S',
304+
format='%(asctime)s %(levelname)s %(module)s:%(lineno)d %(message)s')
305+
306+
#
307+
# Useful for debugging
308+
#
309+
_logger.info('Output >>>{output_dir}<<<'.format(output_dir=output_dir))
310+
_logger.info('Docker image >>>{docker_image}<<<'.format(docker_image=docker_image))
311+
_logger.info('Max # spider to run >>>{max_num_spiders}<<<'.format(max_num_spiders=clo.max_number_spiders_to_run))
312+
msg = 'Max # seconds for spider to run >>>{max_num_seconds}<<<'.format(
313+
max_num_seconds=clo.max_seconds_spiders_to_run)
314+
_logger.info(msg)
315+
316+
#
317+
# now the real work begins ...
318+
#
319+
filenames_by_spider_name = SpidersContainer(docker_image).spiders()
320+
321+
spiders_left_to_run = list(filenames_by_spider_name.keys())
206322

207323
running_spiders = []
208324
run_spiders = []
209325

326+
_logger.info('Spiders to run {spiders}'.format(spiders=spiders_left_to_run))
327+
210328
while spiders_left_to_run or running_spiders:
211329
# check if any of the running spiders have finished
212330
for running_spider in running_spiders:
213331
if running_spider.is_finished():
214-
print('>>>{spider}<<< finished running - {status}'.format(
332+
msg = '>>>{spider}<<< finished running - {status}'.format(
215333
spider=running_spider.spider,
216-
status='success' if running_spider.is_success() else 'failure'))
334+
status='success' if running_spider.is_success() else 'failure')
335+
_logger.info(msg)
217336
running_spider.save_output(output_dir)
218337

219338
running_spiders.remove(running_spider)
220339
run_spiders.append(running_spider)
221340
else:
222341
number_seconds_running = running_spider.number_seconds_running()
223342

224-
if max_seconds_spiders_to_run < number_seconds_running:
343+
if clo.max_seconds_spiders_to_run < number_seconds_running:
225344
msg_fmt = (
226345
'>>>{spider}<<< ran for {seconds:.0f} '
227346
'seconds which is too long (> {max} seconds) - killing spider'
228347
)
229-
print(msg_fmt.format(
348+
msg = msg_fmt.format(
230349
spider=running_spider.spider,
231350
seconds=number_seconds_running,
232-
max=max_seconds_spiders_to_run))
351+
max=clo.max_seconds_spiders_to_run)
352+
_logger.info(msg)
233353

234354
running_spider.kill()
235355
running_spider.save_output(
@@ -239,18 +359,19 @@ def _copy_debug_file(self, output, debug_file_property, spider_output_dir, debug
239359
running_spiders.remove(running_spider)
240360
run_spiders.append(running_spider)
241361
else:
242-
print('>>>{spider}<<< still running after {seconds:.0f} seconds'.format(
362+
msg = '>>>{spider}<<< still running after {seconds:.0f} seconds'.format(
243363
spider=running_spider.spider,
244-
seconds=number_seconds_running))
364+
seconds=number_seconds_running)
365+
_logger.info(msg)
245366

246367
# start spiders left to run until max # of spiders running reached
247368
while spiders_left_to_run:
248-
if len(running_spiders) < max_number_spiders_to_run:
369+
if len(running_spiders) < clo.max_number_spiders_to_run:
249370
spider = spiders_left_to_run.pop(0)
250-
cc = CrawlContainer(spider, docker_image)
371+
cc = CrawlContainer(spider, filenames_by_spider_name[spider], docker_image)
251372
cc.start()
252373
running_spiders.append(cc)
253-
print('>>>{spider}<<< started running'.format(spider=spider))
374+
_logger.info('>>>{spider}<<< started running'.format(spider=spider))
254375
else:
255376
break
256377

0 commit comments

Comments
 (0)