Skip to content

Commit 5533d16

Browse files
committed
Always use CDR format, add extracted_metadata
What was previously stored in PageItem and FormItem is now stored in extracted_metadata: is_page, depth, forms.
1 parent cb37620 commit 5533d16

File tree

4 files changed

+19
-31
lines changed

4 files changed

+19
-31
lines changed

README.md

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,5 +51,4 @@ Useful options to tweak (add to the above command via ``-s NAME=value``):
5151
- ``DOWNLOAD_DELAY`` - set to 0 when crawling local test server
5252
- ``RUN_HH`` - set to 0 to skip running full headless-horseman scripts
5353
- ``PREFER_PAGINATION`` - set to 0 to disable pagination handling
54-
- ``CDR_EXPORT`` - set to 0 to disable export in CDR format
55-
- ``CDR_*`` - CDR export constants
54+
- ``CDR_CRAWLER``, ``CDR_TEAM`` - CDR export metadata constants

undercrawler/items.py

Lines changed: 7 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,6 @@
11
import scrapy
22

33

4-
class PageItem(scrapy.Item):
5-
url = scrapy.Field()
6-
text = scrapy.Field()
7-
is_page = scrapy.Field()
8-
depth = scrapy.Field()
9-
10-
def __repr__(self):
11-
return repr({
12-
'url': self['url'],
13-
'is_page': self['is_page'],
14-
'depth': self['depth'],
15-
})
16-
17-
184
class CDRItem(scrapy.Item):
195

206
# (url)-(crawl timestamp), SHA-256 hashed, UPPERCASE (string)
@@ -27,6 +13,10 @@ class CDRItem(scrapy.Item):
2713
crawler = scrapy.Field()
2814

2915
# Tika/other extraction output (object)
16+
# Our stuff here:
17+
# forms: forms metadata as extracted by formasaurus
18+
# depth: page depth
19+
# is_page: this is a page reached by pagination
3020
extracted_metadata = scrapy.Field()
3121

3222
# Tika/other extraction output (string)
@@ -49,5 +39,6 @@ class CDRItem(scrapy.Item):
4939
version = scrapy.Field()
5040

5141
def __repr__(self):
52-
fields = ['_id', 'url', 'timestamp']
53-
return repr({f: self[f] for f in fields})
42+
fields = ['_id', 'url', 'timestamp', 'extracted_metadata']
43+
return '<CDRItem: {}>'.format(', '.join(
44+
'{}: {}'.format(f, repr(self[f])) for f in fields))

undercrawler/settings.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@
1212
AUTOLOGIN_URL = 'http://127.0.0.1:8089'
1313
AUTOLOGIN_ENABLED = True
1414

15-
CDR_EXPORT = True
1615
CDR_CRAWLER = 'scrapy undercrawler'
1716
CDR_TEAM = 'HG'
1817

undercrawler/spiders/base_spider.py

Lines changed: 11 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,13 @@
44
import hashlib
55

66
import autopager
7+
import formasaurus
78
import scrapy
89
from scrapy.linkextractors import LinkExtractor
910
from scrapy.utils.url import canonicalize_url
1011
from scrapy.utils.python import unique
1112

12-
from ..items import PageItem, CDRItem
13+
from ..items import CDRItem
1314

1415

1516
class BaseSpider(scrapy.Spider):
@@ -39,15 +40,13 @@ def parse(self, response):
3940
if not self.link_extractor.matches(url):
4041
return
4142

42-
if self.settings.getbool('CDR_EXPORT'):
43-
yield self.cdr_item(response)
44-
else:
45-
yield PageItem(
46-
url=url,
47-
text=response.text,
48-
is_page=response.meta.get('is_page', False),
49-
depth=response.meta.get('depth', None),
50-
)
43+
forms = formasaurus.extract_forms(response.text) if response.text \
44+
else []
45+
yield self.cdr_item(response, dict(
46+
is_page=response.meta.get('is_page', False),
47+
depth=response.meta.get('depth', None),
48+
forms=[meta for _, meta in forms],
49+
))
5150

5251
if self.settings.getbool('PREFER_PAGINATION'):
5352
# Follow pagination links; pagination is not a subject of
@@ -64,7 +63,7 @@ def parse(self, response):
6463
for link in self.link_extractor.extract_links(response):
6564
yield self.splash_request(link.url)
6665

67-
def cdr_item(self, response):
66+
def cdr_item(self, response, metadata):
6867
url = response.url
6968
timestamp = int(datetime.utcnow().timestamp() * 1000)
7069
return CDRItem(
@@ -73,7 +72,7 @@ def cdr_item(self, response):
7372
content_type=response.headers['content-type']\
7473
.decode('ascii', 'ignore'),
7574
crawler=self.settings.get('CDR_CRAWLER'),
76-
extracted_metadata={},
75+
extracted_metadata=metadata,
7776
extracted_text='\n'.join(
7877
response.xpath('//body').xpath('string()').extract()),
7978
raw_content=response.text,

0 commit comments

Comments
 (0)