
Commit a952b66

Merge pull request #3 from TeamHG-Memex/cdr

Export in CDRv2 format

2 parents 63272d1 + 5533d16

4 files changed (+71, -23 lines)

README.md

Lines changed: 1 addition & 0 deletions

@@ -51,3 +51,4 @@ Useful options to tweak (add to the above command via ``-s NAME=value``):
 - ``DOWNLOAD_DELAY`` - set to 0 when crawling local test server
 - ``RUN_HH`` - set to 0 to skip running full headless-horseman scripts
 - ``PREFER_PAGINATION`` - set to 0 to disable pagination handling
+- ``CDR_CRAWLER``, ``CDR_TEAM`` - CDR export metadata constants

undercrawler/items.py

Lines changed: 37 additions & 13 deletions

@@ -1,20 +1,44 @@
 import scrapy
 
 
-class PageItem(scrapy.Item):
-    url = scrapy.Field()
-    text = scrapy.Field()
-    is_page = scrapy.Field()
-    depth = scrapy.Field()
+class CDRItem(scrapy.Item):
 
-    def __repr__(self):
-        return repr({
-            'url': self['url'],
-            'is_page': self['is_page'],
-            'depth': self['depth'],
-        })
+    # (url)-(crawl timestamp), SHA-256 hashed, UPPERCASE (string)
+    _id = scrapy.Field()
+
+    # MIME type (multi (strings))
+    content_type = scrapy.Field()
+
+    # Text label identifying the software used by the crawler (string)
+    crawler = scrapy.Field()
+
+    # Tika/other extraction output (object)
+    # Our stuff here:
+    # forms: forms metadata as extracted by formasaurus
+    # depth: page depth
+    # is_page: this is a page reached by pagination
+    extracted_metadata = scrapy.Field()
+
+    # Tika/other extraction output (string)
+    extracted_text = scrapy.Field()
 
+    # Original source text/html (string)
+    raw_content = scrapy.Field()
 
-class FormItem(scrapy.Item):
+    # Text label identifying the team responsible for the crawler (string)
+    team = scrapy.Field()
+
+    # Timestamp of COLLECTION of data from the web (datetime)
+    # https://www.elastic.co/guide/en/elasticsearch/reference/current/mapping-date-format.html#built-in-date-formats
+    timestamp = scrapy.Field()
+
+    # Full URL requested by the crawler (multi (strings))
     url = scrapy.Field()
-    form_type = scrapy.Field()
+
+    # Schema version. This document describes schema version 2.0. (float)
+    version = scrapy.Field()
+
+    def __repr__(self):
+        fields = ['_id', 'url', 'timestamp', 'extracted_metadata']
+        return '<CDRItem: {}>'.format(', '.join(
+            '{}: {}'.format(f, repr(self[f])) for f in fields))
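As a quick illustration of the new item (a minimal sketch with made-up values, not part of the commit): __repr__ prints only _id, url, timestamp and extracted_metadata, so those four fields must be populated before the item is printed.

    # Hypothetical usage of CDRItem; all values are illustrative.
    from undercrawler.items import CDRItem

    item = CDRItem(
        _id='0F1E2D3C',  # SHA-256 hex digest of '<url>-<timestamp>', uppercased
        url='http://example.com/',
        timestamp=1458220800000,  # milliseconds since the epoch
        extracted_metadata={'depth': 0, 'is_page': False, 'forms': []},
        version=2.0,
    )
    print(item)
    # <CDRItem: _id: '0F1E2D3C', url: 'http://example.com/', timestamp: 1458220800000, ...>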

undercrawler/settings.py

Lines changed: 3 additions & 0 deletions

@@ -12,6 +12,9 @@
 AUTOLOGIN_URL = 'http://127.0.0.1:8089'
 AUTOLOGIN_ENABLED = True
 
+CDR_CRAWLER = 'scrapy undercrawler'
+CDR_TEAM = 'HG'
+
 PREFER_PAGINATION = True
 
 DOWNLOADER_MIDDLEWARES = {
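The two new constants can be overridden per crawl. A minimal sketch, equivalent to passing ``-s CDR_CRAWLER=... -s CDR_TEAM=...`` on the command line as documented in the README; the spider name 'base' and the url argument are assumptions, not shown in this commit:

    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    settings = get_project_settings()
    settings.set('CDR_CRAWLER', 'scrapy undercrawler')  # defaults shown above
    settings.set('CDR_TEAM', 'HG')

    process = CrawlerProcess(settings)
    process.crawl('base', url='http://example.com')  # spider name/arg assumed
    process.start()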

undercrawler/spiders/base_spider.py

Lines changed: 30 additions & 10 deletions

@@ -1,5 +1,7 @@
 import re
 import contextlib
+from datetime import datetime
+import hashlib
 
 import autopager
 import formasaurus
@@ -8,7 +10,7 @@
 from scrapy.utils.url import canonicalize_url
 from scrapy.utils.python import unique
 
-from ..items import PageItem, FormItem
+from ..items import CDRItem
 
 
 class BaseSpider(scrapy.Spider):
@@ -35,17 +37,16 @@ def splash_request(self, url, callback=None, **kwargs):
 
     def parse(self, response):
         url = response.url
-        self.logger.info(url)
-        yield PageItem(
-            url=url,
-            text=response.text,
+        if not self.link_extractor.matches(url):
+            return
+
+        forms = formasaurus.extract_forms(response.text) if response.text \
+            else []
+        yield self.cdr_item(response, dict(
             is_page=response.meta.get('is_page', False),
             depth=response.meta.get('depth', None),
-        )
-        if response.text:
-            for _, meta in formasaurus.extract_forms(response.text):
-                yield FormItem(url=url, form_type=meta['form'])
-                self.logger.info('Found a %s form at %s', meta['form'], url)
+            forms=[meta for _, meta in forms],
+        ))
 
         if self.settings.getbool('PREFER_PAGINATION'):
            # Follow pagination links; pagination is not a subject of
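The rewritten parse() folds form detection into the item's extracted_metadata instead of yielding separate FormItem objects. A hedged sketch of what formasaurus returns here (the 'form' key is the one used above; other keys in the metadata dict depend on the formasaurus version):

    import formasaurus

    html = '''<html><body><form>
        <input type="text" name="login">
        <input type="password" name="password">
    </form></body></html>'''

    # extract_forms() yields (form_element, metadata) pairs; the code above
    # keeps only the metadata dicts: forms=[meta for _, meta in forms]
    for form_elem, meta in formasaurus.extract_forms(html):
        print(meta['form'])  # e.g. 'login' - the detected form type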
@@ -62,6 +63,25 @@ def parse(self, response):
         for link in self.link_extractor.extract_links(response):
             yield self.splash_request(link.url)
 
+    def cdr_item(self, response, metadata):
+        url = response.url
+        timestamp = int(datetime.utcnow().timestamp() * 1000)
+        return CDRItem(
+            _id=hashlib.sha256('{}-{}'.format(url, timestamp).encode('utf-8'))\
+                .hexdigest().upper(),
+            content_type=response.headers['content-type']\
+                .decode('ascii', 'ignore'),
+            crawler=self.settings.get('CDR_CRAWLER'),
+            extracted_metadata=metadata,
+            extracted_text='\n'.join(
+                response.xpath('//body').xpath('string()').extract()),
+            raw_content=response.text,
+            team=self.settings.get('CDR_TEAM'),
+            timestamp=timestamp,
+            url=url,
+            version=2.0,
+        )
+
     def _normalize_url(self, url):
         if not url.startswith('http'):
             url = 'http://' + url
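For reference, the _id scheme implemented in cdr_item() can be reproduced standalone; this is a sketch of the same computation, not new behavior: SHA-256 over '<url>-<timestamp>' with a millisecond timestamp, hex digest uppercased.

    import hashlib
    from datetime import datetime

    url = 'http://example.com/page'
    timestamp = int(datetime.utcnow().timestamp() * 1000)  # ms since epoch
    _id = hashlib.sha256(
        '{}-{}'.format(url, timestamp).encode('utf-8')).hexdigest().upper()
    print(len(_id))  # 64: an uppercase hex SHA-256 digest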
