diff --git a/.bumpversion.cfg b/.bumpversion.cfg
deleted file mode 100644
index 1ca6818..0000000
--- a/.bumpversion.cfg
+++ /dev/null
@@ -1,8 +0,0 @@
-[bumpversion]
-current_version = 2.0.1
-commit = True
-tag = True
-
-[bumpversion:file:setup.py]
-
-[bumpversion:file:scrapy_deltafetch/__init__.py]
diff --git a/.coveragerc b/.coveragerc
deleted file mode 100644
index ec1040c..0000000
--- a/.coveragerc
+++ /dev/null
@@ -1,3 +0,0 @@
-[run]
-branch = true
-source = scrapy_deltafetch
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 25fabf7..fd707ad 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -1,41 +1,44 @@
 name: CI
-
 on:
   push:
     branches: [ master ]
   pull_request:
-    branches: [ master ]
-
 jobs:
-  build:
-
+  test:
     runs-on: ubuntu-latest
     strategy:
+      fail-fast: false
       matrix:
-        python-version: [3.5, 3.6, 3.7, 3.8, 3.9]
-
+        include:
+          - python-version: "3.9"
+            toxenv: min
+          - python-version: "3.9"
+          - python-version: "3.10"
+          - python-version: "3.11"
+          - python-version: "3.12"
+          - python-version: "3.13"
+          - python-version: "3.13"
+            toxenv: pre-commit
+          - python-version: "3.13"
+            toxenv: mypy
+          - python-version: "3.13"
+            toxenv: pylint
+          - python-version: "3.13"
+            toxenv: twinecheck
     steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v4
     - name: libddb
      run: |
        sudo apt-get install libdb-dev
-
    - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v2
+      uses: actions/setup-python@v5
       with:
         python-version: ${{ matrix.python-version }}
-    - name: Cache pip
-      uses: actions/cache@v2
-      with:
-        path: ~/.cache/pip
-        key: ${{ runner.os}}-pip-${{ hashFiles('tests/requirements-test.txt') }}
-        restore-keys: |
-          ${{ runner.os}}-pip-
-          ${{ runner.os}}-
-    - name: Install dependencies
-      run: |
-        python -m pip install --upgrade pip
-        pip install -r tests/requirements-test.txt
-    - name: Test with pytest
+    - name: Run
+      env:
+        TOXENV: ${{ matrix.toxenv }}
       run: |
-        pytest
\ No newline at end of file
+        pip install -U tox
+        tox
+    - name: Upload coverage report
+      uses: codecov/codecov-action@v5
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index 175b20f..8d61dc6 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -1,27 +1,23 @@
-name: Upload Python Package
-
+name: Publish
 on:
-  release:
-    types: [created]
-
+  push:
+    tags:
+      - '[0-9]+.[0-9]+.[0-9]+'
 jobs:
-  deploy:
+  publish:
     runs-on: ubuntu-latest
-
+    environment:
+      name: pypi
+      url: https://pypi.org/p/${{ github.event.repository.name }}
+    permissions:
+      id-token: write
     steps:
-    - uses: actions/checkout@v2
-    - name: Set up Python
-      uses: actions/setup-python@v2
-      with:
-        python-version: "3.x"
-    - name: Install dependencies
-      run: |
-        python -m pip install --upgrade pip
-        pip install setuptools wheel twine
-    - name: Build and publish
-      env:
-        TWINE_USERNAME: __token__
-        TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }}
-      run: |
-        python setup.py sdist bdist_wheel
-        twine upload dist/*
\ No newline at end of file
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: 3.13
+      - run: |
+          python -m pip install --upgrade build
+          python -m build
+      - name: Publish to PyPI
+        uses: pypa/gh-action-pypi-publish@release/v1
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000..d3e17be
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,7 @@
+repos:
+- repo: https://github.com/astral-sh/ruff-pre-commit
+  rev: v0.9.7
+  hooks:
+  - id: ruff
+    args: [ --fix ]
+  - id: ruff-format
diff --git a/CHANGES.rst b/CHANGES.rst
index 31e1e5f..6c59cfc 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -1,5 +1,15 @@
 Changes
 =======
+
+2.1.0 (unreleased)
+------------------
+
+* Drop support for Python 3.8 and lower, add support for Python 3.9 and higher.
+* Add support for Scrapy 2.12.
+* Use the ``REQUEST_FINGERPRINTER_CLASS`` setting introduced in Scrapy 2.7.
+* Support new item types introduced in Scrapy 2.2.
+* Support ``Path`` instances in the ``DELTAFETCH_DIR`` setting.
+
 2.0.0 (2021-09-20)
 ------------------
 * drop Python 2 support
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..1db8257
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,228 @@
+[build-system]
+requires = ["setuptools>=61.2"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "scrapy-deltafetch"
+version = "2.0.1"
+authors = [{name = "Zyte", email = "opensource@zyte.com"}]
+license = {text = "BSD"}
+description = "Scrapy middleware to ignore previously crawled pages"
+readme = "README.rst"
+classifiers = [
+    "Development Status :: 4 - Beta",
+    "License :: OSI Approved :: BSD License",
+    "Operating System :: OS Independent",
+    "Programming Language :: Python",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.9",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Programming Language :: Python :: 3.13",
+]
+requires-python = ">=3.9"
+dependencies = ["Scrapy>=1.1.0"]
+
+[project.urls]
+Homepage = "http://github.com/scrapy-plugins/scrapy-deltafetch"
+
+[tool.bumpversion]
+current_version = "2.0.1"
+commit = true
+tag = true
+
+[[tool.bumpversion.files]]
+filename = 'CHANGES.rst'
+search = "\\(unreleased\\)$"
+replace = "({now:%Y-%m-%d})"
+regex = true
+
+[[tool.bumpversion.files]]
+search = "version = \"{current_version}\""
+replace = "version = \"{new_version}\""
+filename = "pyproject.toml"
+
+[[tool.bumpversion.files]]
+filename = "scrapy_deltafetch/__init__.py"
+
+[tool.coverage.run]
+branch = true
+include = ["scrapy_deltafetch/*"]
+omit = ["tests/*"]
+disable_warnings = ["include-ignored"]
+
+[tool.coverage.paths]
+source = [
+    "scrapy_deltafetch",
+    ".tox/**/site-packages/scrapy-deltafetch"
+]
+
+[tool.coverage.report]
+# https://github.com/nedbat/coveragepy/issues/831#issuecomment-517778185
+exclude_lines = ["pragma: no cover", "if TYPE_CHECKING:"]
+
+[tool.pylint.MASTER]
+persistent = "no"
+jobs = 1  # >1 hides results
+
+[tool.pylint."MESSAGES CONTROL"]
+enable = [
+    "useless-suppression",
+]
+disable = [
+    # Ones we want to ignore
+    "attribute-defined-outside-init",
+    "broad-exception-caught",
+    "consider-using-with",
+    "cyclic-import",
+    "disallowed-name",
+    "duplicate-code",  # https://github.com/pylint-dev/pylint/issues/214
+    "fixme",
+    "import-outside-toplevel",
+    "inherit-non-class",  # false positives with create_deprecated_class()
+    "invalid-name",
+    "invalid-overridden-method",
+    "isinstance-second-argument-not-valid-type",  # false positives with create_deprecated_class()
+    "line-too-long",
+    "logging-format-interpolation",
+    "logging-fstring-interpolation",
+    "logging-not-lazy",
+    "missing-docstring",
+    "no-member",
+    "no-name-in-module",  # caught by mypy already
+    "no-value-for-parameter",  # https://github.com/pylint-dev/pylint/issues/3268
+    "not-callable",
+    "protected-access",
+    "redefined-builtin",
+    "redefined-outer-name",
+    "too-few-public-methods",
+    "too-many-ancestors",
+    "too-many-arguments",
+    "too-many-branches",
+    "too-many-function-args",
+    "too-many-instance-attributes",
+    "too-many-lines",
+    "too-many-locals",
+    "too-many-positional-arguments",
+    "too-many-public-methods",
+    "too-many-return-statements",
+    "unused-argument",
+    "unused-import",
+    "unused-variable",
+    "useless-import-alias",  # used as a hint to mypy
+    "useless-return",  # https://github.com/pylint-dev/pylint/issues/6530
+    "wrong-import-position",
+]
+
+[tool.ruff.lint]
+extend-select = [
+    # flake8-bugbear
+    "B",
+    # flake8-comprehensions
+    "C4",
+    # pydocstyle
+    "D",
+    # flake8-future-annotations
+    "FA",
+    # flynt
+    "FLY",
+    # refurb
+    "FURB",
+    # isort
+    "I",
+    # flake8-implicit-str-concat
+    "ISC",
+    # flake8-logging
+    "LOG",
+    # Perflint
+    "PERF",
+    # pygrep-hooks
+    "PGH",
+    # flake8-pie
+    "PIE",
+    # pylint
+    "PL",
+    # flake8-pytest-style
+    "PT",
+    # flake8-use-pathlib
+    "PTH",
+    # flake8-pyi
+    "PYI",
+    # flake8-quotes
+    "Q",
+    # flake8-return
+    "RET",
+    # flake8-raise
+    "RSE",
+    # Ruff-specific rules
+    "RUF",
+    # flake8-bandit
+    "S",
+    # flake8-simplify
+    "SIM",
+    # flake8-slots
+    "SLOT",
+    # flake8-debugger
+    "T10",
+    # flake8-type-checking
+    "TC",
+    # pyupgrade
+    "UP",
+    # pycodestyle warnings
+    "W",
+    # flake8-2020
+    "YTT",
+]
+ignore = [
+    # Missing docstring in public module
+    "D100",
+    # Missing docstring in public class
+    "D101",
+    # Missing docstring in public function
+    "D103",
+    # Missing docstring in public package
+    "D104",
+    # Missing docstring in magic method
+    "D105",
+    # Missing docstring in __init__
+    "D107",
+    # One-line docstring should fit on one line with quotes
+    "D200",
+    # No blank lines allowed after function docstring
+    "D202",
+    # 1 blank line required between summary line and description
+    "D205",
+    # Multi-line docstring closing quotes should be on a separate line
+    "D209",
+    # First line should end with a period
+    "D400",
+    # First line should be in imperative mood; try rephrasing
+    "D401",
+    # First line should not be the function's "signature"
+    "D402",
+    # Too many return statements
+    "PLR0911",
+    # Too many branches
+    "PLR0912",
+    # Too many arguments in function definition
+    "PLR0913",
+    # Too many statements
+    "PLR0915",
+    # Magic value used in comparison
+    "PLR2004",
+    # Mutable class attributes should be annotated with `typing.ClassVar`
+    "RUF012",
+    # Use of `assert` detected
+    "S101",
+]
+
+[tool.ruff.lint.per-file-ignores]
+# D102: Missing docstring in public method
+"tests/**" = ["D102"]
+
+[tool.ruff.lint.pydocstyle]
+convention = "pep257"
+
+[tool.setuptools]
+packages = ["scrapy_deltafetch"]
diff --git a/requirements.txt b/requirements.txt
deleted file mode 100644
index 0b9465e..0000000
--- a/requirements.txt
+++ /dev/null
@@ -1 +0,0 @@
-scrapy>=1.1.0
diff --git a/scrapy_deltafetch/__init__.py b/scrapy_deltafetch/__init__.py
index 40b1066..a229437 100644
--- a/scrapy_deltafetch/__init__.py
+++ b/scrapy_deltafetch/__init__.py
@@ -1,4 +1,4 @@
 from .middleware import DeltaFetch
 
-
+__all__ = ["DeltaFetch"]
 __version__ = "2.0.1"
diff --git a/scrapy_deltafetch/middleware.py b/scrapy_deltafetch/middleware.py
index 9f6f1b4..bd686f7 100644
--- a/scrapy_deltafetch/middleware.py
+++ b/scrapy_deltafetch/middleware.py
@@ -1,24 +1,20 @@
+import dbm
 import logging
-import os
 import time
-import dbm
+from pathlib import Path
 
+from scrapy import signals
+from scrapy.exceptions import NotConfigured
 from scrapy.http import Request
-from scrapy.item import Item
-from scrapy.utils.request import request_fingerprint
 from scrapy.utils.project import data_path
 from scrapy.utils.python import to_bytes
-from scrapy.exceptions import NotConfigured
-from scrapy import signals
-
 
 logger = logging.getLogger(__name__)
 
 
-class DeltaFetch(object):
-    """
-    This is a spider middleware to ignore requests to pages containing items
-    seen in previous crawls of the same spider, thus producing a "delta crawl"
+class DeltaFetch:
+    """Spider middleware to ignore requests to pages containing items seen in
+    previous crawls of the same spider, thus producing a "delta crawl"
     containing only new items.
 
     This also speeds up the crawl, by reducing the number of requests that need
@@ -32,56 +28,67 @@ def __init__(self, dir, reset=False, stats=None):
         self.stats = stats
 
     @classmethod
-    def from_crawler(cls, crawler):
+    def from_crawler(cls, crawler):  # noqa: D102
         s = crawler.settings
-        if not s.getbool('DELTAFETCH_ENABLED'):
+        if not s.getbool("DELTAFETCH_ENABLED"):
             raise NotConfigured
-        dir = data_path(s.get('DELTAFETCH_DIR', 'deltafetch'))
-        reset = s.getbool('DELTAFETCH_RESET')
+        dir = data_path(s.get("DELTAFETCH_DIR", "deltafetch"))
+        reset = s.getbool("DELTAFETCH_RESET")
         o = cls(dir, reset, crawler.stats)
+        if o.stats is None:
+            o.stats = crawler.stats
         crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
         crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
+
+        try:
+            o.fingerprint = crawler.request_fingerprinter.fingerprint
+        except AttributeError:
+            from scrapy.utils.request import request_fingerprint
+
+            o.fingerprint = request_fingerprint
+
         return o
 
-    def spider_opened(self, spider):
-        if not os.path.exists(self.dir):
-            os.makedirs(self.dir)
+    def spider_opened(self, spider):  # noqa: D102
+        dir = Path(self.dir)
+        dir.mkdir(parents=True, exist_ok=True)
         # TODO may be tricky, as there may be different paths on systems
-        dbpath = os.path.join(self.dir, '%s.db' % spider.name)
-        reset = self.reset or getattr(spider, 'deltafetch_reset', False)
-        flag = 'n' if reset else 'c'
+        dbpath = dir / f"{spider.name}.db"
+        reset = self.reset or getattr(spider, "deltafetch_reset", False)
+        flag = "n" if reset else "c"
         try:
-            self.db = dbm.open(dbpath, flag=flag)
+            self.db = dbm.open(str(dbpath), flag=flag)  # noqa: SIM115
        except Exception:
-            logger.warning("Failed to open DeltaFetch database at %s, "
-                           "trying to recreate it" % dbpath)
-            if os.path.exists(dbpath):
-                os.remove(dbpath)
-            self.db = dbm.open(dbpath, 'c')
+            logger.warning(
+                f"Failed to open DeltaFetch database at {dbpath}, trying to recreate it"
+            )
+            if dbpath.exists():
+                dbpath.unlink()
+            self.db = dbm.open(str(dbpath), "c")  # noqa: SIM115
 
-    def spider_closed(self, spider):
+    def spider_closed(self, spider):  # noqa: D102
         self.db.close()
 
-    def process_spider_output(self, response, result, spider):
+    def process_spider_output(self, response, result, spider):  # noqa: D102
         for r in result:
             if isinstance(r, Request):
                 key = self._get_key(r)
                 if key in self.db and self._is_enabled_for_request(r):
-                    logger.info("Ignoring already visited: %s" % r)
+                    logger.info(f"Ignoring already visited: {r}")
                     if self.stats:
-                        self.stats.inc_value('deltafetch/skipped', spider=spider)
+                        self.stats.inc_value("deltafetch/skipped", spider=spider)
                     continue
-            elif isinstance(r, (Item, dict)):
+            else:
                 key = self._get_key(response.request)
                 self.db[key] = str(time.time())
                 if self.stats:
-                    self.stats.inc_value('deltafetch/stored', spider=spider)
+                    self.stats.inc_value("deltafetch/stored", spider=spider)
             yield r
 
     def _get_key(self, request):
-        key = request.meta.get('deltafetch_key') or request_fingerprint(request)
+        key = request.meta.get("deltafetch_key") or self.fingerprint(request)
         return to_bytes(key)
 
     def _is_enabled_for_request(self, request):
         # Gives you option to disable deltafetch for some requests
-        return request.meta.get('deltafetch_enabled', True)
+        return request.meta.get("deltafetch_enabled", True)
diff --git a/setup.cfg b/setup.cfg
deleted file mode 100644
index 3c6e79c..0000000
--- a/setup.cfg
+++ /dev/null
@@ -1,2 +0,0 @@
-[bdist_wheel]
-universal=1
diff --git a/setup.py b/setup.py
deleted file mode 100644
index ad86588..0000000
--- a/setup.py
+++ /dev/null
@@ -1,27 +0,0 @@
-from setuptools import setup
-
-setup(
-    name='scrapy-deltafetch',
-    version='2.0.1',
-    license='BSD',
-    description='Scrapy middleware to ignore previously crawled pages',
-    long_description=open('README.rst').read(),
-    author='Zyte',
-    author_email='opensource@zyte.com',
-    url='http://github.com/scrapy-plugins/scrapy-deltafetch',
-    packages=['scrapy_deltafetch'],
-    platforms=['Any'],
-    classifiers=[
-        'Development Status :: 4 - Beta',
-        'License :: OSI Approved :: BSD License',
-        'Operating System :: OS Independent',
-        'Programming Language :: Python',
-        'Programming Language :: Python :: 3',
-        'Programming Language :: Python :: 3.5',
-        'Programming Language :: Python :: 3.6',
-        'Programming Language :: Python :: 3.7',
-        'Programming Language :: Python :: 3.8',
-        'Programming Language :: Python :: 3.9',
-    ],
-    install_requires=['Scrapy>=1.1.0']
-)
diff --git a/tests/benchmark.py b/tests/benchmark.py
index a8edee5..555500b 100644
--- a/tests/benchmark.py
+++ b/tests/benchmark.py
@@ -1,6 +1,6 @@
 import tempfile
+from unittest import mock
 
-import mock
 from scrapy import Request, Spider
 from scrapy.statscollectors import StatsCollector
 from scrapy.utils.test import get_crawler
@@ -9,7 +9,7 @@
 
 
 def benchmark_middleware(result):
-    spider_name = 'df_tests'
+    spider_name = "df_tests"
     spider = Spider(spider_name)
     temp_dir = tempfile.gettempdir()
     crawler = get_crawler(Spider)
@@ -17,15 +17,15 @@
     mw = DeltaFetch(temp_dir, reset=False, stats=stats)
     mw.spider_opened(spider)
     response = mock.Mock()
-    response.request = Request('http://url',
-                               meta={'deltafetch_key': 'key'})
+    response.request = Request("http://url", meta={"deltafetch_key": "key"})
-    for x in mw.process_spider_output(response, result, spider):
+    for _x in mw.process_spider_output(response, result, spider):
         pass
 
+
 def test_middleware(benchmark):
     result = []
     for x in range(50000):
-        request = Request(f'https://{x}')
+        request = Request(f"https://{x}")
         result.append(request)
 
     result = benchmark(benchmark_middleware, result)
diff --git a/tests/requirements-test.txt b/tests/requirements-test.txt
deleted file mode 100644
index bf2c733..0000000
--- a/tests/requirements-test.txt
+++ /dev/null
@@ -1,5 +0,0 @@
--r ../requirements.txt
-mock
-pytest
-pytest-benchmark
-
diff --git a/tests/test_deltafetch.py b/tests/test_deltafetch.py
index 362bf1e..6e42d58 100644
--- a/tests/test_deltafetch.py
+++ b/tests/test_deltafetch.py
@@ -1,335 +1,352 @@
-from unittest import TestCase, skipIf
-
-import os
 import dbm
-import mock
 import tempfile
+from dataclasses import dataclass
+from pathlib import Path
+from unittest import TestCase, mock
+
+import pytest
 from scrapy import Request
+from scrapy.exceptions import NotConfigured
 from scrapy.item import Item
-from scrapy.spiders import Spider
 from scrapy.settings import Settings
-from scrapy.exceptions import NotConfigured
-from scrapy.utils.request import request_fingerprint
+from scrapy.spiders import Spider
 from scrapy.utils.python import to_bytes
-from scrapy.statscollectors import StatsCollector
 from scrapy.utils.test import get_crawler
 
 from scrapy_deltafetch.middleware import DeltaFetch
 
 
 class DeltaFetchTestCase(TestCase):
-
     mwcls = DeltaFetch
 
     def setUp(self):
-        self.spider_name = 'df_tests'
+        self.spider_name = "df_tests"
         self.spider = Spider(self.spider_name)
 
         # DeltaFetch creates .db files named after the spider's name
-        self.temp_dir = tempfile.gettempdir()
-        self.db_path = os.path.join(self.temp_dir, '%s.db' % self.spider.name)
-
-        crawler = get_crawler(Spider)
-        self.stats = StatsCollector(crawler)
+        self.temp_dir = Path(tempfile.gettempdir())
+        self.db_path = self.temp_dir / f"{self.spider.name}.db"
+
+    def get_mw(self, dir=None, reset=None, cls=DeltaFetch):
+        settings = {
+            "DELTAFETCH_ENABLED": True,
+        }
+        if dir is not None:
+            settings["DELTAFETCH_DIR"] = dir
+        if reset is not None:
+            settings["DELTAFETCH_RESET"] = reset
+        crawler = get_crawler(Spider, settings_dict=settings)
+        return cls.from_crawler(crawler)
 
     def test_init(self):
         # path format is any, the folder is not created
-        instance = self.mwcls('/any/dir', True, stats=self.stats)
+        instance = self.get_mw("/any/dir", reset=True)
         assert isinstance(instance, self.mwcls)
-        self.assertEqual(instance.dir, '/any/dir')
-        self.assertEqual(self.stats.get_stats(), {})
-        self.assertEqual(instance.reset, True)
+        assert instance.dir == "/any/dir"
+        assert instance.stats.get_stats() == {}
+        assert instance.reset is True
 
     def test_init_from_crawler(self):
         crawler = mock.Mock()
         # void settings
         crawler.settings = Settings({})
-        self.assertRaises(NotConfigured, self.mwcls.from_crawler, crawler)
-        with mock.patch('scrapy.utils.project.project_data_dir') as data_dir, \
-                mock.patch('scrapy.utils.project.inside_project') as in_project:
+        with pytest.raises(NotConfigured):
+            self.mwcls.from_crawler(crawler)
+        with (
+            mock.patch("scrapy.utils.project.project_data_dir") as data_dir,
+            mock.patch("scrapy.utils.project.inside_project") as in_project,
+        ):
             data_dir.return_value = self.temp_dir
             in_project.return_value = True
 
             # simple project_data_dir mock with based settings
-            crawler.settings = Settings({'DELTAFETCH_ENABLED': True})
+            crawler.settings = Settings({"DELTAFETCH_ENABLED": True})
             instance = self.mwcls.from_crawler(crawler)
             assert isinstance(instance, self.mwcls)
-            self.assertEqual(
-                instance.dir, os.path.join(self.temp_dir, 'deltafetch'))
-            self.assertEqual(instance.reset, False)
+            assert instance.dir == str(self.temp_dir / "deltafetch")
+            assert instance.reset is False
 
             # project_data_dir mock with advanced settings
-            crawler.settings = Settings({'DELTAFETCH_ENABLED': True,
-                                         'DELTAFETCH_DIR': 'other',
-                                         'DELTAFETCH_RESET': True})
+            crawler.settings = Settings(
+                {
+                    "DELTAFETCH_ENABLED": True,
+                    "DELTAFETCH_DIR": "other",
+                    "DELTAFETCH_RESET": True,
+                }
+            )
             instance = self.mwcls.from_crawler(crawler)
             assert isinstance(instance, self.mwcls)
-            self.assertEqual(
-                instance.dir, os.path.join(self.temp_dir, 'other'))
-            self.assertEqual(instance.reset, True)
+            assert instance.dir == str(self.temp_dir / "other")
+            assert instance.reset is True
 
     def test_spider_opened_new(self):
         """Middleware should create a .db file if not found."""
-        if os.path.exists(self.db_path):
-            os.remove(self.db_path)
-        mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats)
-        assert not hasattr(self.mwcls, 'db')
+        if self.db_path.exists():
+            self.db_path.unlink()
+        mw = self.get_mw(dir=self.temp_dir, reset=False)
+        assert not hasattr(self.mwcls, "db")
         mw.spider_opened(self.spider)
-        assert os.path.isdir(self.temp_dir)
-        assert os.path.exists(self.db_path)
-        assert hasattr(mw, 'db')
+        assert self.temp_dir.is_dir()
+        assert self.db_path.exists()
+        assert hasattr(mw, "db")
         assert mw.db.keys() == []
 
     def test_spider_opened_existing(self):
         """Middleware should open and use existing and valid .db files."""
         self._create_test_db()
-        mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats)
-        assert not hasattr(self.mwcls, 'db')
+        mw = self.get_mw(dir=self.temp_dir, reset=False)
+        assert not hasattr(self.mwcls, "db")
         mw.spider_opened(self.spider)
-        assert hasattr(mw, 'db')
-        for k, v in [
-            (b'test_key_1', b'test_v_1'),
-            (b'test_key_2', b'test_v_2')
-        ]:
+        assert hasattr(mw, "db")
+        for k, v in [(b"test_key_1", b"test_v_1"), (b"test_key_2", b"test_v_2")]:
             assert mw.db.get(k) == v
 
     def test_spider_opened_corrupt_dbfile(self):
         """Middleware should create a new .db if it cannot open it."""
         # create an invalid .db file
-        with open(self.db_path, "wb") as dbfile:
-            dbfile.write(b'bad')
-        mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats)
-        assert not hasattr(self.mwcls, 'db')
+        with self.db_path.open("wb") as dbfile:
+            dbfile.write(b"bad")
+        mw = self.get_mw(dir=self.temp_dir, reset=False)
+        assert not hasattr(self.mwcls, "db")
         # file corruption is only detected when opening spider
         mw.spider_opened(self.spider)
-        assert os.path.isdir(self.temp_dir)
-        assert os.path.exists(self.db_path)
-        assert hasattr(mw, 'db')
+        assert Path(self.temp_dir).is_dir()
+        assert Path(self.db_path).exists()
+        assert hasattr(mw, "db")
         # and db should be empty (it was re-created)
         assert mw.db.keys() == []
 
     def test_spider_opened_existing_spider_reset(self):
         self._create_test_db()
-        mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats)
-        assert not hasattr(self.mwcls, 'db')
+        mw = self.get_mw(self.temp_dir, reset=False)
+        assert not hasattr(self.mwcls, "db")
         self.spider.deltafetch_reset = True
         mw.spider_opened(self.spider)
         assert mw.db.keys() == []
 
     def test_spider_opened_reset_non_existing_db(self):
-        mw = self.mwcls(self.temp_dir, reset=True, stats=self.stats)
-        assert not hasattr(self.mwcls, 'db')
+        mw = self.get_mw(dir=self.temp_dir, reset=True)
+        assert not hasattr(self.mwcls, "db")
         self.spider.deltafetch_reset = True
         mw.spider_opened(self.spider)
-        assert mw.db.get(b'random') is None
-
+        assert mw.db.get(b"random") is None
+
     def test_spider_opened_recreate(self):
         self._create_test_db()
-        mw = self.mwcls(self.temp_dir, reset=True, stats=self.stats)
-        assert not hasattr(self.mwcls, 'db')
+        mw = self.get_mw(dir=self.temp_dir, reset=True)
+        assert not hasattr(self.mwcls, "db")
         mw.spider_opened(self.spider)
-        assert hasattr(mw, 'db')
+        assert hasattr(mw, "db")
         assert mw.db.keys() == []
 
     def test_spider_closed(self):
         self._create_test_db()
-        mw = self.mwcls(self.temp_dir, reset=True, stats=self.stats)
+        mw = self.get_mw(dir=self.temp_dir, reset=True)
         mw.spider_opened(self.spider)
-        assert mw.db.get('random') is None
+        assert mw.db.get("random") is None
         mw.spider_closed(self.spider)
-        with self.assertRaises(Exception) as cm:
-            # should fail because database closed
-            mw.db.get('radom')
-        # self.assertRaisesRegex(, mw.db.get('random'))
+        with pytest.raises(dbm.error):
+            mw.db.get("radom")
 
     def test_process_spider_output(self):
         self._create_test_db()
-        mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats)
+        settings = {
+            "DELTAFETCH_DIR": self.temp_dir,
+            "DELTAFETCH_ENABLED": True,
+        }
+        crawler = get_crawler(Spider, settings_dict=settings)
+        mw = self.mwcls.from_crawler(crawler)
         mw.spider_opened(self.spider)
         response = mock.Mock()
-        response.request = Request('http://url',
-                                   meta={'deltafetch_key': 'key'})
+        response.request = Request("http://url", meta={"deltafetch_key": "key"})
         result = []
-        self.assertEqual(list(mw.process_spider_output(
-            response, result, self.spider)), [])
+        assert not list(mw.process_spider_output(response, result, self.spider))
         result = [
             # same URL but with new key --> it should be processed
-            Request('http://url', meta={'deltafetch_key': 'key1'}),
-
+            Request("http://url", meta={"deltafetch_key": "key1"}),
             # 'test_key_1' is already in the test db --> it should be skipped
-            Request('http://url1', meta={'deltafetch_key': 'test_key_1'})
+            Request("http://url1", meta={"deltafetch_key": "test_key_1"}),
         ]
         # so only the 1 request should go through
-        self.assertEqual(list(mw.process_spider_output(
-            response, result, self.spider)), [result[0]])
+        assert list(mw.process_spider_output(response, result, self.spider)) == [
+            result[0]
+        ]
         # the skipped "http://url1" should be counted in stats
-        self.assertEqual(self.stats.get_stats(), {'deltafetch/skipped': 1})
+        assert crawler.stats.get_stats() == {"deltafetch/skipped": 1}
         # b'key' should not be in the db yet as no item was collected yet
-        self.assertEqual(set(mw.db.keys()),
-                         set([b'test_key_1',
-                              b'test_key_2']))
+        assert set(mw.db.keys()) == {b"test_key_1", b"test_key_2"}
         # if the spider returns items, the request's key is added in db
         result = [Item(), "not a base item"]
-        self.assertEqual(list(mw.process_spider_output(
-            response, result, self.spider)), result)
-        self.assertEqual(set(mw.db.keys()),
-                         set([b'key',
-                              b'test_key_1',
-                              b'test_key_2']))
-        assert mw.db[b'key']
+        assert list(mw.process_spider_output(response, result, self.spider)) == result
+        assert set(mw.db.keys()) == {b"key", b"test_key_1", b"test_key_2"}
+        assert mw.db[b"key"]
 
     def test_process_spider_output_with_ignored_request(self):
         self._create_test_db()
-        mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats)
+        settings = {
+            "DELTAFETCH_DIR": self.temp_dir,
+            "DELTAFETCH_ENABLED": True,
+        }
+        crawler = get_crawler(Spider, settings_dict=settings)
+        mw = self.mwcls.from_crawler(crawler)
         mw.spider_opened(self.spider)
         response = mock.Mock()
-        response.request = Request('http://url')
+        response.request = Request("http://url")
         result = []
-        self.assertEqual(
-            list(mw.process_spider_output(response, result, self.spider)), [])
+        assert not list(mw.process_spider_output(response, result, self.spider))
         result = [
-            Request('http://url1'),
+            Request("http://url1"),
             # 'url1' is already in the db, but deltafetch_enabled=False
             # flag is set, URL should be processed.
-            Request('http://url1',
-                    meta={
-                        'deltafetch_enabled': False
-                    })
+            Request("http://url1", meta={"deltafetch_enabled": False}),
         ]
         # so 2 requests should go through
-        self.assertEqual(
-            list(mw.process_spider_output(response, result, self.spider)),
-            [result[0], result[1]])
+        assert list(mw.process_spider_output(response, result, self.spider)) == [
+            result[0],
+            result[1],
+        ]
 
     def test_process_spider_output_dict(self):
         self._create_test_db()
-        mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats)
+        mw = self.get_mw(dir=self.temp_dir, reset=False)
         mw.spider_opened(self.spider)
         response = mock.Mock()
-        response.request = Request('http://url',
-                                   meta={'deltafetch_key': 'key'})
+        response.request = Request("http://url", meta={"deltafetch_key": "key"})
         result = [{"somekey": "somevalue"}]
-        self.assertEqual(list(mw.process_spider_output(
-            response, result, self.spider)), result)
-        self.assertEqual(set(mw.db.keys()),
-                         set([b'key',
-                              b'test_key_1',
-                              b'test_key_2']))
-        assert mw.db[b'key']
+        assert list(mw.process_spider_output(response, result, self.spider)) == result
+        assert set(mw.db.keys()) == {b"key", b"test_key_1", b"test_key_2"}
+        assert mw.db[b"key"]
 
     def test_process_spider_output_stats(self):
         self._create_test_db()
-        mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats)
+        mw = self.get_mw(dir=self.temp_dir)
         mw.spider_opened(self.spider)
         response = mock.Mock()
-        response.request = Request('http://url',
-                                   meta={'deltafetch_key': 'key'})
+        response.request = Request("http://url", meta={"deltafetch_key": "key"})
         result = []
-        self.assertEqual(list(mw.process_spider_output(
-            response, result, self.spider)), [])
-        self.assertEqual(self.stats.get_stats(), {})
+        assert not list(mw.process_spider_output(response, result, self.spider))
+        assert mw.stats.get_stats() == {}
         result = [
-            Request('http://url', meta={'deltafetch_key': 'key'}),
-            Request('http://url1', meta={'deltafetch_key': 'test_key_1'})
+            Request("http://url", meta={"deltafetch_key": "key"}),
+            Request("http://url1", meta={"deltafetch_key": "test_key_1"}),
         ]
-        self.assertEqual(list(mw.process_spider_output(
-            response, result, self.spider)), [result[0]])
-        self.assertEqual(self.stats.get_value('deltafetch/skipped'), 1)
-        result = [Item(), "not a base item"]
-        self.assertEqual(list(mw.process_spider_output(
-            response, result, self.spider)), result)
-        self.assertEqual(self.stats.get_value('deltafetch/stored'), 1)
+        assert list(mw.process_spider_output(response, result, self.spider)) == [
+            result[0]
+        ]
+        assert mw.stats.get_value("deltafetch/skipped") == 1
+
+        @dataclass
+        class TestItem:
+            foo: str
+
+        result = [Item(), TestItem("bar")]
+        assert list(mw.process_spider_output(response, result, self.spider)) == result
+        assert mw.stats.get_value("deltafetch/stored") == 2
 
     def test_init_from_crawler_legacy(self):
         # test with subclass not handling passed stats
         class LegacyDeltaFetchSubClass(self.mwcls):
-
-            def __init__(self, dir, reset=False, *args, **kwargs):
-                super(LegacyDeltaFetchSubClass, self).__init__(dir=dir, reset=reset)
+            def __init__(self, dir, reset, *args, **kwargs):
+                super().__init__(dir=dir, reset=reset)
                 self.something = True
 
         crawler = mock.Mock()
         # void settings
         crawler.settings = Settings({})
-        self.assertRaises(NotConfigured, self.mwcls.from_crawler, crawler)
+        with pytest.raises(NotConfigured):
+            self.mwcls.from_crawler(crawler)
 
-        with mock.patch('scrapy.utils.project.project_data_dir') as data_dir, \
-                mock.patch('scrapy.utils.project.inside_project') as in_project:
+        with (
+            mock.patch("scrapy.utils.project.project_data_dir") as data_dir,
+            mock.patch("scrapy.utils.project.inside_project") as in_project,
+        ):
             data_dir.return_value = self.temp_dir
             in_project.return_value = True
 
             # simple project_data_dir mock with based settings
-            crawler.settings = Settings({'DELTAFETCH_ENABLED': True})
+            crawler.settings = Settings({"DELTAFETCH_ENABLED": True})
             instance = LegacyDeltaFetchSubClass.from_crawler(crawler)
             assert isinstance(instance, self.mwcls)
-            self.assertEqual(
-                instance.dir, os.path.join(self.temp_dir, 'deltafetch'))
-            self.assertEqual(instance.reset, False)
+            assert instance.dir == str(Path(self.temp_dir) / "deltafetch")
+            assert instance.reset is False
 
             # project_data_dir mock with advanced settings
-            crawler.settings = Settings({'DELTAFETCH_ENABLED': True,
-                                         'DELTAFETCH_DIR': 'other',
-                                         'DELTAFETCH_RESET': True})
+            crawler.settings = Settings(
+                {
+                    "DELTAFETCH_ENABLED": True,
+                    "DELTAFETCH_DIR": "other",
+                    "DELTAFETCH_RESET": True,
+                }
+            )
             instance = LegacyDeltaFetchSubClass.from_crawler(crawler)
             assert isinstance(instance, self.mwcls)
-            self.assertEqual(
-                instance.dir, os.path.join(self.temp_dir, 'other'))
-            self.assertEqual(instance.reset, True)
+            assert instance.dir == str(Path(self.temp_dir) / "other")
+            assert instance.reset is True
 
     def test_process_spider_output_stats_legacy(self):
         # testing the subclass not handling stats works at runtime
         # (i.e. that trying to update stats does not trigger exception)
         class LegacyDeltaFetchSubClass(self.mwcls):
-
-            def __init__(self, dir, reset=False, *args, **kwargs):
-                super(LegacyDeltaFetchSubClass, self).__init__(dir=dir, reset=reset)
+            def __init__(self, dir, *args, reset=False, **kwargs):
+                super().__init__(dir=dir, reset=reset)
                 self.something = True
 
         self._create_test_db()
-        mw = LegacyDeltaFetchSubClass(self.temp_dir, reset=False)
+        mw = self.get_mw(dir=self.temp_dir, reset=False, cls=LegacyDeltaFetchSubClass)
         mw.spider_opened(self.spider)
         response = mock.Mock()
-        response.request = Request('http://url',
-                                   meta={'deltafetch_key': 'key'})
+        response.request = Request("http://url", meta={"deltafetch_key": "key"})
         result = []
-        self.assertEqual(list(mw.process_spider_output(
-            response, result, self.spider)), [])
-        self.assertEqual(self.stats.get_stats(), {})
+        assert not list(mw.process_spider_output(response, result, self.spider))
+        assert mw.stats.get_stats() == {}
         result = [
-            Request('http://url', meta={'deltafetch_key': 'key'}),
-            Request('http://url1', meta={'deltafetch_key': 'test_key_1'})
+            Request("http://url", meta={"deltafetch_key": "key"}),
+            Request("http://url1", meta={"deltafetch_key": "test_key_1"}),
         ]
-        # stats should not be updated
-        self.assertEqual(list(mw.process_spider_output(
-            response, result, self.spider)), [result[0]])
-        self.assertEqual(self.stats.get_value('deltafetch/skipped'), None)
+        assert list(mw.process_spider_output(response, result, self.spider)) == [
+            result[0]
+        ]
+        assert mw.stats.get_value("deltafetch/skipped") == 1
 
-        result = [Item(), "not a base item"]
-        self.assertEqual(list(mw.process_spider_output(
-            response, result, self.spider)), result)
-        self.assertEqual(self.stats.get_value('deltafetch/stored'), None)
+        @dataclass
+        class TestItem:
+            foo: str
+
+        result = [Item(), TestItem("bar")]
+        assert list(mw.process_spider_output(response, result, self.spider)) == result
+        assert mw.stats.get_value("deltafetch/stored") == 2
 
     def test_get_key(self):
-        mw = self.mwcls(self.temp_dir, reset=True)
-        test_req1 = Request('http://url1')
-        self.assertEqual(mw._get_key(test_req1),
-                         to_bytes(request_fingerprint(test_req1)))
-        test_req2 = Request('http://url2', meta={'deltafetch_key': b'dfkey1'})
-        self.assertEqual(mw._get_key(test_req2), b'dfkey1')
-
-        test_req3 = Request('http://url2', meta={'deltafetch_key': u'dfkey1'})
+        settings = {
+            "DELTAFETCH_DIR": self.temp_dir,
+            "DELTAFETCH_ENABLED": True,
+            "DELTAFETCH_RESET": True,
+        }
+        crawler = get_crawler(Spider, settings_dict=settings)
+        mw = self.mwcls.from_crawler(crawler)
+        test_req1 = Request("http://url1")
+        try:
+            fingerprint = crawler.request_fingerprinter.fingerprint
+        except AttributeError:  # Scrapy < 2.7.0
+            from scrapy.utils.request import request_fingerprint
+
+            fingerprint = request_fingerprint
+        assert mw._get_key(test_req1) == to_bytes(fingerprint(test_req1))
+        test_req2 = Request("http://url2", meta={"deltafetch_key": b"dfkey1"})
+        assert mw._get_key(test_req2) == b"dfkey1"
+
+        test_req3 = Request("http://url2", meta={"deltafetch_key": "dfkey1"})
         # key will be converted to bytes
-        self.assertEqual(mw._get_key(test_req3), b'dfkey1')
+        assert mw._get_key(test_req3) == b"dfkey1"
 
     def _create_test_db(self):
         # truncate test db if there were failed tests
-        db = dbm.open(self.db_path, 'n')
-        db[b'test_key_1'] = b'test_v_1'
-        db[b'test_key_2'] = b'test_v_2'
-        db.close()
+        with dbm.open(str(self.db_path), "n") as db:
+            db[b"test_key_1"] = b"test_v_1"
+            db[b"test_key_2"] = b"test_v_2"
diff --git a/tox.ini b/tox.ini
new file mode 100644
index 0000000..6c4228c
--- /dev/null
+++ b/tox.ini
@@ -0,0 +1,50 @@
+[tox]
+envlist = pre-commit,mypy,pylint,twinecheck,min,py39,py310,py311,py312,py313
+
+[testenv]
+deps =
+    pytest
+    pytest-cov
+    pytest-benchmark
+commands =
+    pytest \
+        --cov=scrapy_deltafetch \
+        --cov-config=pyproject.toml \
+        --cov-report=xml \
+        --cov-report= \
+        {posargs:scrapy_deltafetch tests}
+
+[testenv:min]
+basepython = python3.9
+deps =
+    {[testenv]deps}
+    scrapy==1.1.0
+
+[testenv:pre-commit]
+deps =
+    pre-commit
+commands =
+    pre-commit run {posargs:--all-files}
+
+[testenv:mypy]
+deps =
+    {[testenv]deps}
+    mypy==1.15.0
+commands =
+    mypy {posargs:scrapy_deltafetch tests}
+
+# https://github.com/astral-sh/ruff/issues/970
+[testenv:pylint]
+deps =
+    {[testenv]deps}
+    pylint==3.3.4
+commands =
+    pylint {posargs:scrapy_deltafetch tests}
+
+[testenv:twinecheck]
+deps =
+    twine==6.1.0
+    build==1.2.2.post1
+commands =
+    python -m build --sdist
+    twine check dist/*
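For reference, a minimal usage sketch of the middleware with the settings and request meta keys this change touches. The setting names and the deltafetch_key / deltafetch_enabled / deltafetch_reset names come from the diff above; the SPIDER_MIDDLEWARES order value (100) and the example key are illustrative assumptions, not part of this patch.

# settings.py -- minimal sketch; the middleware order value 100 is an assumption.
from pathlib import Path

SPIDER_MIDDLEWARES = {
    "scrapy_deltafetch.DeltaFetch": 100,
}
DELTAFETCH_ENABLED = True
# As of this change, DELTAFETCH_DIR may be a pathlib.Path as well as a str.
DELTAFETCH_DIR = Path("deltafetch")
# True (or a deltafetch_reset attribute on the spider) rebuilds the key database.
DELTAFETCH_RESET = False

# Per-request control in a spider callback, via the meta keys read by
# DeltaFetch.process_spider_output() / _get_key() in the diff above:
#   yield Request(url, meta={"deltafetch_key": "item-123"})   # custom de-dup key
#   yield Request(url, meta={"deltafetch_enabled": False})    # bypass DeltaFetch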