2 changes: 2 additions & 0 deletions README.md
@@ -51,3 +51,5 @@ Useful options to tweak (add to the above command via ``-s NAME=value``):
 - ``DOWNLOAD_DELAY`` - set to 0 when crawling local test server
 - ``RUN_HH`` - set to 0 to skip running full headless-horseman scripts
 - ``PREFER_PAGINATION`` - set to 0 to disable pagination handling
+- ``CDR_EXPORT`` - set to 0 to disable export in CDR format
+- ``CDR_*`` - CDR export constants
37 changes: 35 additions & 2 deletions undercrawler/items.py
@@ -15,6 +15,39 @@ def __repr__(self):
         })
 
 
-class FormItem(scrapy.Item):
+class CDRItem(scrapy.Item):
+
+    # (url)-(crawl timestamp), SHA-256 hashed, UPPERCASE (string)
+    _id = scrapy.Field()
+
+    # MIME type (multi (strings))
+    content_type = scrapy.Field()
+
+    # Text label identifying the software used by the crawler (string)
+    crawler = scrapy.Field()
+
+    # Tika/other extraction output (object)
+    extracted_metadata = scrapy.Field()
+
+    # Tika/other extraction output (string)
+    extracted_text = scrapy.Field()
+
+    # Original source text/html (string)
+    raw_content = scrapy.Field()
+
+    # Text label identifying the team responsible for the crawler (string)
+    team = scrapy.Field()
+
+    # Timestamp of COLLECTION of data from the web (datetime)
+    # https://www.elastic.co/guide/en/elasticsearch/reference/current/mapping-date-format.html#built-in-date-formats
+    timestamp = scrapy.Field()
+
+    # Full URL requested by the crawler (multi (strings))
     url = scrapy.Field()
-    form_type = scrapy.Field()
+
+    # Schema version. This document describes schema version 2.0. (float)
+    version = scrapy.Field()
+
+    def __repr__(self):
+        fields = ['_id', 'url', 'timestamp']
+        return repr({f: self[f] for f in fields})
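As a side note, a minimal sketch (not part of this PR; all values are made up) of how the item behaves: CDRItem is a regular scrapy.Item, so fields are set like dict keys, and the __repr__ above keeps log output short by printing only _id, url and timestamp.

    from undercrawler.items import CDRItem

    item = CDRItem(
        _id='9F86D081884C7D65...',        # placeholder; the real value is an uppercased SHA-256 hex digest
        url='http://example.com/page',    # assumed example URL
        timestamp=1458571200000,          # assumed collection time, epoch milliseconds
        raw_content='<html>...</html>',
        version=2.0,
    )
    print(item)   # only _id, url and timestamp appear, per __repr__ above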
4 changes: 4 additions & 0 deletions undercrawler/settings.py
@@ -12,6 +12,10 @@
 AUTOLOGIN_URL = 'http://127.0.0.1:8089'
 AUTOLOGIN_ENABLED = True
 
+CDR_EXPORT = True
+CDR_CRAWLER = 'scrapy undercrawler'
+CDR_TEAM = 'HG'
+
 PREFER_PAGINATION = True
 
 DOWNLOADER_MIDDLEWARES = {
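A side note on usage, not part of the PR: these are ordinary Scrapy settings, so besides editing settings.py they can be overridden per run with ``-s NAME=value`` (as the README change above documents) or per spider via Scrapy's standard custom_settings. A minimal sketch of the latter, with a hypothetical spider name:

    import scrapy

    class ExampleSpider(scrapy.Spider):
        # Hypothetical spider, for illustration only.
        name = 'example'
        custom_settings = {
            'CDR_EXPORT': False,                  # fall back to PageItem output
            'CDR_CRAWLER': 'scrapy undercrawler',
            'CDR_TEAM': 'HG',
        }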
47 changes: 34 additions & 13 deletions undercrawler/spiders/base_spider.py
@@ -1,14 +1,15 @@
 import re
 import contextlib
+from datetime import datetime
+import hashlib
 
 import autopager
 import formasaurus
 import scrapy
 from scrapy.linkextractors import LinkExtractor
 from scrapy.utils.url import canonicalize_url
 from scrapy.utils.python import unique
 
-from ..items import PageItem, FormItem
+from ..items import PageItem, CDRItem
 
 
 class BaseSpider(scrapy.Spider):
@@ -35,17 +36,18 @@ def splash_request(self, url, callback=None, **kwargs):

     def parse(self, response):
         url = response.url
         self.logger.info(url)
-        yield PageItem(
-            url=url,
-            text=response.text,
-            is_page=response.meta.get('is_page', False),
-            depth=response.meta.get('depth', None),
-        )
-        if response.text:
-            for _, meta in formasaurus.extract_forms(response.text):
-                yield FormItem(url=url, form_type=meta['form'])
-                self.logger.info('Found a %s form at %s', meta['form'], url)
         if not self.link_extractor.matches(url):
             return
 
+        if self.settings.getbool('CDR_EXPORT'):
+            yield self.cdr_item(response)
+        else:
+            yield PageItem(
+                url=url,
+                text=response.text,
+                is_page=response.meta.get('is_page', False),
+                depth=response.meta.get('depth', None),
+            )
 
         if self.settings.getbool('PREFER_PAGINATION'):
             # Follow pagination links; pagination is not a subject of

Inline review comment on this hunk:

Contributor: 👍
@@ -62,6 +64,25 @@ def parse(self, response):
         for link in self.link_extractor.extract_links(response):
             yield self.splash_request(link.url)
 
+    def cdr_item(self, response):
+        url = response.url
+        timestamp = int(datetime.utcnow().timestamp() * 1000)
+        return CDRItem(
+            _id=hashlib.sha256('{}-{}'.format(url, timestamp).encode('utf-8'))\
+                .hexdigest().upper(),
+            content_type=response.headers['content-type']\
+                .decode('ascii', 'ignore'),
+            crawler=self.settings.get('CDR_CRAWLER'),
+            extracted_metadata={},
+            extracted_text='\n'.join(
+                response.xpath('//body//text()').extract()),
+            raw_content=response.text,
+            team=self.settings.get('CDR_TEAM'),
+            timestamp=timestamp,
+            url=url,
+            version=2.0,
+        )
+
     def _normalize_url(self, url):
         if not url.startswith('http'):
             url = 'http://' + url

Inline review comments on the extracted_text line:

Contributor: Paul suggested the string() xpath function here: scrapy/parsel#34; I've tried it, and the output is a bit cleaner.

Contributor (author): Yes, this is nicer, and less string joining happening, thanks!
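The suggestion above, as a minimal standalone sketch (assumed final form, not necessarily the exact code that was merged): let XPath's string() function flatten the body text instead of joining every text node by hand. parsel's Selector is used here only to make the snippet self-contained; inside the spider it would just be response.xpath(...).

    from parsel import Selector

    sel = Selector(text='<html><body><p>Hello <b>world</b></p></body></html>')

    # Current PR code: join every text node under <body> with newlines.
    joined = '\n'.join(sel.xpath('//body//text()').extract())      # 'Hello \nworld'

    # Reviewer's suggestion (scrapy/parsel#34): let string() flatten the body.
    flattened = sel.xpath('string(//body)').extract_first()        # 'Hello world'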