Skip to content

Commit 430ce9f

Browse files
committed
feat: material changes to metadata presentation
1 parent ece9de3 commit 430ce9f

File tree

7 files changed

+223
-44
lines changed

7 files changed

+223
-44
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ This project adheres to [Semantic Versioning](http://semver.org/).
2424
* docker based development environment now parses repo's setup.py for pre-reqs
2525
that need to be installed when the development docker image is built - this
2626
change enabled the removal of requirements.txt from the repo's root directory
27+
* change format of metadata returned by ```spiders.py``` and ```cloudfeaster.spider.Spider```
2728

2829
### Removed
2930

bin/README.md

Lines changed: 111 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -114,11 +114,121 @@ popd > /dev/null
114114

115115
### [run-all-spiders.sh](run-all-spiders.sh)
116116

117+
* for use by a spider author during spider development
117118
* calls ```run-spider.sh``` for all spiders in a repo
119+
* example of command line usage
120+
121+
```bash
122+
(env) dave@Daves-New-New-Mac-Mini gaming-spiders % run-all-spiders.sh
123+
/Users/dave/gaming-spiders/gaming_spiders/gamehouseonlinegames.py
124+
/Users/dave/gaming-spiders/gaming_spiders/gamesonly.py
125+
/Users/dave/gaming-spiders/gaming_spiders/hiddenobjectgames.py
126+
/Users/dave/gaming-spiders/gaming_spiders/mahjonggames.py
127+
/Users/dave/gaming-spiders/gaming_spiders/match3games.py
128+
/Users/dave/gaming-spiders/gaming_spiders/mindgames.py
129+
/Users/dave/gaming-spiders/gaming_spiders/miniclip.py
130+
/Users/dave/gaming-spiders/gaming_spiders/msnonlinegames.py
131+
/Users/dave/gaming-spiders/gaming_spiders/solitaireonline.py
132+
/Users/dave/gaming-spiders/gaming_spiders/zygomatic.py
133+
(env) dave@Daves-New-New-Mac-Mini gaming-spiders %
134+
```
118135

119136
### [run-spider.sh](run-spider.sh)
120137

121-
* runs a spider in a spider repo using ```run_spider.sh miniclip```
138+
* for use by a spider author during spider development
139+
* runs a spider in a development docker container created
140+
from a docker image named by the DEV_ENV_DOCKER_IMAGE
141+
environment variable
142+
* example of command line usage
143+
144+
```bash
145+
(env) dave@Daves-New-New-Mac-Mini gaming-spiders % pwd
146+
/Users/dave/gaming-spiders
147+
(env) dave@Daves-New-New-Mac-Mini gaming-spiders % ls -la gaming_spiders/*.py
148+
-rw-r--r-- 1 dave staff 22 26 Aug 2020 gaming_spiders/__init__.py
149+
-rwxr-xr-x 1 dave staff 1780 26 Jan 20:00 gaming_spiders/gamehouseonlinegames.py
150+
-rwxr-xr-x 1 dave staff 1256 26 Jan 19:58 gaming_spiders/gamesonly.py
151+
-rwxr-xr-x 1 dave staff 638 26 Aug 2020 gaming_spiders/hiddenobjectgames.py
152+
-rwxr-xr-x 1 dave staff 618 26 Aug 2020 gaming_spiders/mahjonggames.py
153+
-rwxr-xr-x 1 dave staff 614 26 Aug 2020 gaming_spiders/match3games.py
154+
-rwxr-xr-x 1 dave staff 606 26 Aug 2020 gaming_spiders/mindgames.py
155+
-rwxr-xr-x 1 dave staff 1363 26 Jan 22:04 gaming_spiders/miniclip.py
156+
-rwxr-xr-x 1 dave staff 1210 26 Jan 22:18 gaming_spiders/msnonlinegames.py
157+
-rwxr-xr-x 1 dave staff 630 26 Aug 2020 gaming_spiders/solitaireonline.py
158+
-rw-r--r-- 1 dave staff 1397 26 Jan 20:11 gaming_spiders/zygomatic.py
159+
(env) dave@Daves-New-New-Mac-Mini gaming-spiders % run-spider.sh miniclip.py | jq .
160+
{
161+
"1": {
162+
"title": "1 8 Ball Pool",
163+
"link": "https://www.miniclip.com/games/8-ball-pool-multiplayer/en/#t-w-t-H"
164+
},
165+
"2": {
166+
"title": "2 Agar.io",
167+
"link": "https://www.miniclip.com/games/agar-io/en/#t-w-t-H"
168+
},
169+
"3": {
170+
"title": "3 Flip Master",
171+
"link": "https://www.miniclip.com/games/flip-master/en/#t-w-t-H"
172+
},
173+
"4": {
174+
"title": "4 Krunker.io",
175+
"link": "https://www.miniclip.com/games/krunkerio/en/#t-w-t-H"
176+
},
177+
"5": {
178+
"title": "5 Soccer Stars Mobile",
179+
"link": "https://www.miniclip.com/games/soccer-stars-mobile/en/#t-w-t-H"
180+
},
181+
"6": {
182+
"title": "6 Short Ride",
183+
"link": "https://www.miniclip.com/games/short-ride/en/#t-w-t-H"
184+
},
185+
"7": {
186+
"title": "7 Quick Fire Pool Instant",
187+
"link": "https://www.miniclip.com/games/quickfire-pool-instant/en/#t-w-t-H"
188+
},
189+
"8": {
190+
"title": "8 Bubble Trouble",
191+
"link": "https://www.miniclip.com/games/bubble-trouble/en/#t-w-t-H"
192+
},
193+
"9": {
194+
"title": "9 Tanki Online",
195+
"link": "https://www.miniclip.com/games/tanki-online/en/#t-w-t-H"
196+
},
197+
"10": {
198+
"title": "10 Head Ball 2",
199+
"link": "https://www.miniclip.com/games/head-ball-2/en/#t-w-t-H"
200+
},
201+
"_metadata": {
202+
"status": {
203+
"code": 0,
204+
"message": "Ok"
205+
},
206+
"spider": {
207+
"name": "miniclip.py",
208+
"version": "sha256:48c81ae8b86cbf71035905ebc0c4ead321b823fafe093dbf5624912808f4b954"
209+
},
210+
"crawlArgs": [],
211+
"crawlTime": {
212+
"started": "2021-03-02T16:32:56.198453+00:00",
213+
"durationInMs": 4157
214+
}
215+
},
216+
"_debug": {
217+
"screenshot": "/var/folders/zc/51nmqy_93559vqw_1y526y240000gn/T/tmp.0llGnGgS/screenshot.png",
218+
"crawlLog": "/var/folders/zc/51nmqy_93559vqw_1y526y240000gn/T/tmp.0llGnGgS/crawl-log.txt",
219+
"chromeDriverLog": "/var/folders/zc/51nmqy_93559vqw_1y526y240000gn/T/tmp.0llGnGgS/chromedriver-log.txt"
220+
}
221+
}
222+
(env) dave@Daves-New-New-Mac-Mini gaming-spiders % ls -la /var/folders/zc/51nmqy_93559vqw_1y526y240000gn/T/tmp.0llGnGgS
223+
total 3112
224+
drwx------ 6 dave staff 192 2 Mar 11:33 .
225+
drwx------@ 104 dave staff 3328 2 Mar 11:32 ..
226+
-rw------- 1 dave staff 170642 2 Mar 11:33 chromedriver-log.txt
227+
-rw------- 1 dave staff 0 2 Mar 11:32 crawl-log.txt
228+
-rw-r--r-- 1 dave staff 1618 2 Mar 11:33 crawl-output.json
229+
-rw------- 1 dave staff 1414454 2 Mar 11:33 screenshot.png
230+
(env) dave@Daves-New-New-Mac-Mini gaming-spiders %
231+
```
122232

123233
## Utilities
124234

bin/int-test-run-all-spiders-in-ci-pipeline.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -61,13 +61,15 @@ def spiders(self):
6161
'spiders.py',
6262
]
6363

64-
output = json.loads(subprocess.check_output(args).decode('UTF-8').strip())
64+
spider_metadata_by_spider_by_category = json.loads(subprocess.check_output(args).decode('UTF-8').strip())
6565

66-
rv = list(output.keys())
67-
rv.remove('_metadata')
68-
rv.sort()
66+
filenames = set()
6967

70-
return rv
68+
for spider_metadata_by_spider in spider_metadata_by_spider_by_category.values():
69+
for spider_metadata in spider_metadata_by_spider.values():
70+
filenames.add(spider_metadata['absoluteFilename'])
71+
72+
return list(filenames)
7173

7274

7375
class CrawlContainer(object):

bin/spiders.py

Lines changed: 5 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
# -*- coding: utf-8 -*-
33

44
import hashlib
5-
import importlib
65
import inspect
76
import json
87
import logging
@@ -107,28 +106,23 @@ def parse_args(self, *args, **kwargs):
107106
# now discover some spiders:-)
108107
#
109108
sd = SpiderDiscovery(clo.samples)
110-
metadata = sd.discover()
111-
updated_metadata = {}
112-
for spider_name in metadata.keys():
113-
spider_module_name = '.'.join(spider_name.split('.')[:-1])
114-
spider_module = importlib.import_module(spider_module_name)
115-
updated_metadata[os.path.basename(spider_module.__file__)] = metadata[spider_name]
109+
output = sd.discover()
116110

117111
#
118112
# add _metadata
119113
#
120114
module = sys.modules['__main__']
121115
source = inspect.getsource(module)
122-
hash = hashlib.sha256(source.encode('UTF-8'))
116+
version_hash = hashlib.sha256(source.encode('UTF-8'))
123117

124-
updated_metadata['_metadata'] = {
118+
output['_metadata'] = {
125119
'name': os.path.basename(__file__),
126-
'version': '%s:%s' % (hash.name, hash.hexdigest()),
120+
'version': '{name}:{hash_digest}'.format(name=version_hash.name, hash_digest=version_hash.hexdigest()),
127121
}
128122

129123
#
130124
# finally generate some stdout
131125
#
132-
print(json.dumps(updated_metadata))
126+
print(json.dumps(output))
133127

134128
sys.exit(0)

cloudfeaster/samples/xe_exchange_rates.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,10 @@ class XEExchangeRatesSpider(spider.Spider):
1616
def get_metadata(cls):
1717
return {
1818
'url': 'https://www.xe.com/?cn=cad',
19+
'categories': [
20+
cls.get_default_category(),
21+
'fx_rates',
22+
],
1923
}
2024

2125
def crawl(self, browser):

cloudfeaster/spider.py

Lines changed: 52 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -129,13 +129,9 @@ def get_validated_metadata(cls):
129129
# A spider can appear in more than one category.
130130
# A spider's categories are declared as part of the spider's metadata.
131131
# If no categories are declared in a spider's metadata then the
132-
# spider's package name (per the above) is used as the spider's
133-
# category. One caveat, by convention, package names often end
134-
# with _spiders. When a category name is generated from a spider's
135-
# package name, the trailing _spiders is removed from the package name.
132+
# spider's default category is used.
136133
#
137-
category = cls._replace_spiders_postfix_reg_ex.sub('', cls.__module__.split('.')[0])
138-
metadata['categories'] = [category]
134+
metadata['categories'] = [cls.get_default_category()]
139135

140136
metadata['absoluteFilename'] = sys.modules[cls.__module__].__file__
141137
metadata['fullyQualifiedClassName'] = '{module}.{cls}'.format(
@@ -219,6 +215,26 @@ def get_validated_metadata(cls):
219215

220216
return metadata
221217

218+
@classmethod
219+
def get_default_category(cls):
220+
"""A spider's fully qualified name will be something like gaming_spiders.miniclip.Spider
221+
A spider's default category is everything up to but not including the first period in a
222+
spider's fully qualified name, with any trailing _spiders removed (e.g. gaming_spiders -> gaming).
223+
224+
get_default_category() simplifies specification of metadata when spiders are in
225+
multiple categories. See sample code below.
226+
227+
def get_metadata(cls):
228+
return {
229+
'url': 'https://www.xe.com/?cn=cad',
230+
'categories': [
231+
cls.get_default_category(),
232+
'fx_rates',
233+
],
234+
}
235+
"""
236+
return cls._replace_spiders_postfix_reg_ex.sub('', cls.__module__.split('.')[0])
237+
222238
@classmethod
223239
def get_metadata(cls):
224240
"""Spider classes should override this method to return
@@ -1020,7 +1036,26 @@ def filter(filename): return(filename.endswith('.py') and not filename.startswit
10201036
# the concrete subclasses of ```cloudfeaster.spider.Spider```
10211037
# which will be the spiders we're interested in
10221038
#
1023-
return self._find_concrete_spider_classes(Spider)
1039+
concrete_spider_classes = self._find_concrete_spider_classes(Spider)
1040+
1041+
#
1042+
# now for some fancy formatting of the results
1043+
#
1044+
rv = {}
1045+
for concrete_spider_class in concrete_spider_classes:
1046+
metadata = concrete_spider_class.get_validated_metadata()
1047+
fully_qualified_class = metadata['fullyQualifiedClassName']
1048+
spider = fully_qualified_class.split('.')[-2]
1049+
1050+
for category in metadata['categories']:
1051+
if category not in rv:
1052+
rv[category] = {}
1053+
rv[category][spider] = metadata
1054+
1055+
#
1056+
# all done!
1057+
#
1058+
return rv
10241059

10251060
def _find_concrete_spider_classes(self, base_class):
10261061
base_msg = "looking for concrete spider classes of base class '%s.%s'" % (
@@ -1029,22 +1064,24 @@ def _find_concrete_spider_classes(self, base_class):
10291064
)
10301065
_logger.info(base_msg)
10311066

1032-
rv = {}
1067+
concrete_spider_classes = []
10331068
for sub_class in base_class.__subclasses__():
1034-
full_sub_class_name = '%s.%s' % (sub_class.__module__, sub_class.__name__)
1069+
fully_qualified_class_name = '%s.%s' % (sub_class.__module__, sub_class.__name__)
10351070

1036-
_logger.info("%s - assessing '%s'", base_msg, full_sub_class_name)
1071+
_logger.info("%s - assessing '%s'", base_msg, fully_qualified_class_name)
10371072

10381073
if not sub_class.__subclasses__():
1039-
_logger.info("%s - identified concrete class '%s'", base_msg, full_sub_class_name)
1074+
msg_fmt = "{base_msg} - identified concrete class '{class_name}'"
1075+
msg = msg_fmt.format(base_msg=base_msg, class_name=fully_qualified_class_name)
1076+
_logger.info(msg)
10401077

1041-
rv[full_sub_class_name] = sub_class.get_validated_metadata()
1078+
concrete_spider_classes.append(sub_class)
10421079
else:
1043-
_logger.info("%s - identified abstract class '%s'", base_msg, full_sub_class_name)
1080+
_logger.info("%s - identified abstract class '%s'", base_msg, fully_qualified_class_name)
10441081

1045-
rv.update(self._find_concrete_spider_classes(sub_class))
1082+
concrete_spider_classes.extend(self._find_concrete_spider_classes(sub_class))
10461083

1047-
return rv
1084+
return concrete_spider_classes
10481085

10491086
@classmethod
10501087
def load_and_discover_all_spiders_in_package(cls, spider_package_name):

cloudfeaster/tests/spider_tests.py

Lines changed: 43 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import hashlib
44
import http.server
5+
import importlib
56
import inspect
67
import re
78
import sys
@@ -1536,27 +1537,54 @@ def test_get_selected_and_select_by_visible_text_on_non_select_element(self):
15361537

15371538

15381539
class TestSpiderDiscovery(unittest.TestCase):
1539-
"""A series of unit tests that validate ```spider.SpiderDiscovery```."""
1540+
"""A series of unit tests that validate ```spider.SpiderDiscovery```.
1541+
1542+
:ADDRESS: not really super happy with this test. Feels like a lot of hard coding
1543+
and assumptions. Maybe not but just feels like that.
1544+
"""
15401545

15411546
def test_spiders_no_samples(self):
1542-
spider.SpiderDiscovery.load_and_discover_all_spiders_in_package('cloudfeaster.samples')
1543-
spider.SpiderDiscovery.load_and_discover_all_spiders_in_package('cloudfeaster.tests.some_test_spiders')
1547+
importlib.import_module('cloudfeaster.samples')
1548+
importlib.import_module('cloudfeaster.samples.pypi')
1549+
importlib.import_module('cloudfeaster.samples.pythonwheels')
1550+
importlib.import_module('cloudfeaster.samples.xe_exchange_rates')
1551+
1552+
importlib.import_module('cloudfeaster.tests.some_test_spiders')
1553+
importlib.import_module('cloudfeaster.tests.some_test_spiders.abstract')
1554+
importlib.import_module('cloudfeaster.tests.some_test_spiders.supersimpleconcrete')
15441555

15451556
sd = spider.SpiderDiscovery()
1546-
spiders_by_spider_name = sd.discover()
1557+
spiders_by_category = sd.discover()
1558+
1559+
#
1560+
# only expecting a certain set of categories
1561+
#
1562+
categories = list(spiders_by_category.keys())
1563+
categories.sort()
1564+
expected_categories = [
1565+
'cloudfeaster',
1566+
'fx_rates',
1567+
]
1568+
expected_categories.sort()
1569+
self.assertEqual(categories, expected_categories)
1570+
1571+
spiders_by_spider_name = spiders_by_category['cloudfeaster']
15471572

15481573
#
1549-
# confirm available sample spiders
1574+
# only expecting a certain set of spiders
15501575
#
1551-
def is_sample_spider_name(k):
1552-
return k.startswith('cloudfeaster.samples')
1553-
spider_names = [k for k in spiders_by_spider_name.keys() if is_sample_spider_name(k)]
1576+
fqcn_key = 'fullyQualifiedClassName'
1577+
1578+
def is_sample(metadata):
1579+
return metadata[fqcn_key].startswith('cloudfeaster.samples')
1580+
1581+
spider_names = [metadata[fqcn_key] for metadata in spiders_by_spider_name.values() if is_sample(metadata)]
15541582
spider_names.sort()
15551583

15561584
expected_spider_names = [
1557-
'cloudfeaster.samples.xe_exchange_rates.XEExchangeRatesSpider',
15581585
'cloudfeaster.samples.pypi.PyPISpider',
15591586
'cloudfeaster.samples.pythonwheels.PythonWheelsSpider',
1587+
'cloudfeaster.samples.xe_exchange_rates.XEExchangeRatesSpider',
15601588
]
15611589
expected_spider_names.sort()
15621590

@@ -1565,9 +1593,12 @@ def is_sample_spider_name(k):
15651593
#
15661594
# confirm available test spiders which explore concrete and abstract spider classes
15671595
#
1568-
def is_test_spider_name(k):
1569-
return k.startswith('cloudfeaster.tests.some_test_spiders')
1570-
spider_names = [k for k in spiders_by_spider_name.keys() if is_test_spider_name(k)]
1596+
fqcn_key = 'fullyQualifiedClassName'
1597+
1598+
def is_test(metadata):
1599+
return metadata[fqcn_key].startswith('cloudfeaster.tests.some_test_spiders')
1600+
1601+
spider_names = [metadata[fqcn_key] for metadata in spiders_by_spider_name.values() if is_test(metadata)]
15711602
spider_names.sort()
15721603

15731604
expected_spider_names = [

0 commit comments

Comments
 (0)