|
| 1 | +# -*- coding: utf-8 -*- |
| 2 | +import scrapy |
| 3 | +from pytest_twisted import inlineCallbacks |
| 4 | + |
| 5 | +from scrapy_splash import SplashRequest |
| 6 | +from .utils import crawl_items, requires_splash, HtmlResource |
| 7 | + |
# Lua script mirroring the README example: it re-applies the request's
# cookies, loads the page with the original headers/method/body, waits
# briefly for JS to run, then returns the final URL, the last response's
# headers and status, the cookie jar, the rendered HTML, the args that
# were passed in, and a sample evaljs() result ("1+2").
DEFAULT_SCRIPT = """
function main(splash)
    splash:init_cookies(splash.args.cookies)
    assert(splash:go{
        splash.args.url,
        headers=splash.args.headers,
        http_method=splash.args.http_method,
        body=splash.args.body,
    })
    assert(splash:wait(0.5))

    local entries = splash:history()
    local last_response = entries[#entries].response
    return {
        url = splash:url(),
        headers = last_response.headers,
        http_status = last_response.status,
        cookies = splash:get_cookies(),
        html = splash:html(),
        args = splash.args,
        jsvalue = splash:evaljs("1+2"),
    }
end
"""
| 32 | + |
| 33 | + |
class HelloWorld(HtmlResource):
    """Static test page whose visible body text is produced by inline JS,
    so it only reads "hello world!" after Splash has rendered it."""
    extra_headers = {'X-MyHeader': 'my value'}
    html = """
    <html><body><script>document.write('hello world!');</script></body></html>
    """
| 39 | + |
| 40 | + |
class ResponseSpider(scrapy.Spider):
    """Fetch ``url`` through Splash and emit the resulting response."""
    url = None

    def start_requests(self):
        # A single Splash-rendered request to the configured URL.
        yield SplashRequest(self.url)

    def parse(self, response):
        item = {'response': response}
        yield item
| 50 | + |
| 51 | + |
class ReloadSpider(ResponseSpider):
    """Request the URL twice and store both responses.

    The second request differs only by a ``#foo`` fragment, so this
    exercises both start_requests and parse, and checks that the
    dupefilter takes the fragment into account."""

    def parse(self, response):
        yield {'response': response}
        followup = self.url + '#foo'
        yield SplashRequest(followup)
| 60 | + |
| 61 | + |
class LuaScriptSpider(ResponseSpider):
    """Run the README-style Lua script via the ``execute`` endpoint."""

    def start_requests(self):
        script_args = {'lua_source': DEFAULT_SCRIPT, 'foo': 'bar'}
        yield SplashRequest(self.url + "#foo", endpoint='execute',
                            args=script_args)
| 68 | + |
| 69 | + |
@requires_splash
@inlineCallbacks
def test_basic(settings):
    """A plain SplashRequest produces exactly one rendered response."""
    items, url, crawler = yield crawl_items(ResponseSpider, HelloWorld,
                                            settings)
    assert len(items) == 1
    response = items[0]['response']
    assert response.url == url
    body_text = response.css('body::text').get()
    assert body_text.strip() == "hello world!"
| 79 | + |
| 80 | + |
@requires_splash
@inlineCallbacks
def test_reload(settings):
    """Two requests differing only by URL fragment both get through,
    while an exact duplicate is filtered once by the dupefilter."""
    items, url, crawler = yield crawl_items(ReloadSpider, HelloWorld, settings)
    assert len(items) == 2
    assert crawler.stats.get_value('dupefilter/filtered') == 1

    first = items[0]['response']
    assert first.url == url
    assert first.css('body::text').get().strip() == "hello world!"

    second = items[1]['response']
    assert second.body == first.body
    assert second is not first
    assert second.url == first.url + "#foo"
| 95 | + |
| 96 | + |
@requires_splash
@inlineCallbacks
def test_basic_lua(settings):
    """The Lua script returns rendered HTML, headers, a JS value and
    the extra args that were passed to the endpoint."""
    items, url, crawler = yield crawl_items(LuaScriptSpider, HelloWorld,
                                            settings)
    assert len(items) == 1
    response = items[0]['response']
    assert response.url == url + "/#foo"
    assert response.css('body::text').get().strip() == "hello world!"
    assert response.data['jsvalue'] == 3
    assert response.headers['X-MyHeader'] == b'my value'
    assert response.data['args']['foo'] == 'bar'
0 commit comments