From 3c68465bf6b33cc470e57f58aedf47b9e42ec028 Mon Sep 17 00:00:00 2001 From: fsmeraldi Date: Mon, 24 Feb 2025 15:35:13 +0000 Subject: [PATCH 01/17] Resolve request_fingerprint deprecation --- scrapy_deltafetch/middleware.py | 9 +++++---- tests/test_deltafetch.py | 6 ++++-- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/scrapy_deltafetch/middleware.py b/scrapy_deltafetch/middleware.py index 9f6f1b4..a255000 100644 --- a/scrapy_deltafetch/middleware.py +++ b/scrapy_deltafetch/middleware.py @@ -5,7 +5,7 @@ from scrapy.http import Request from scrapy.item import Item -from scrapy.utils.request import request_fingerprint +from scrapy.utils.request import RequestFingerprinter from scrapy.utils.project import data_path from scrapy.utils.python import to_bytes from scrapy.exceptions import NotConfigured @@ -26,10 +26,11 @@ class DeltaFetch(object): intensive). """ - def __init__(self, dir, reset=False, stats=None): + def __init__(self, dir, reset=False, stats=None, crawler=None): self.dir = dir self.reset = reset self.stats = stats + self.fingerprint=RequestFingerprinter(crawler).fingerprint @classmethod def from_crawler(cls, crawler): @@ -38,7 +39,7 @@ def from_crawler(cls, crawler): raise NotConfigured dir = data_path(s.get('DELTAFETCH_DIR', 'deltafetch')) reset = s.getbool('DELTAFETCH_RESET') - o = cls(dir, reset, crawler.stats) + o = cls(dir, reset, crawler.stats, crawler) crawler.signals.connect(o.spider_opened, signal=signals.spider_opened) crawler.signals.connect(o.spider_closed, signal=signals.spider_closed) return o @@ -79,7 +80,7 @@ def process_spider_output(self, response, result, spider): yield r def _get_key(self, request): - key = request.meta.get('deltafetch_key') or request_fingerprint(request) + key = request.meta.get('deltafetch_key') or self.fingerprint(request) return to_bytes(key) def _is_enabled_for_request(self, request): diff --git a/tests/test_deltafetch.py b/tests/test_deltafetch.py index 362bf1e..7fafb58 100644 --- a/tests/test_deltafetch.py +++ b/tests/test_deltafetch.py @@ -9,7 +9,7 @@ from scrapy.spiders import Spider from scrapy.settings import Settings from scrapy.exceptions import NotConfigured -from scrapy.utils.request import request_fingerprint +from scrapy.utils.request import RequestFingerprinter from scrapy.utils.python import to_bytes from scrapy.statscollectors import StatsCollector from scrapy.utils.test import get_crawler @@ -318,8 +318,10 @@ def __init__(self, dir, reset=False, *args, **kwargs): def test_get_key(self): mw = self.mwcls(self.temp_dir, reset=True) test_req1 = Request('http://url1') + crawler = get_crawler(Spider) + fingerprint=RequestFingerprinter(crawler).fingerprint self.assertEqual(mw._get_key(test_req1), - to_bytes(request_fingerprint(test_req1))) + to_bytes(fingerprint(test_req1))) test_req2 = Request('http://url2', meta={'deltafetch_key': b'dfkey1'}) self.assertEqual(mw._get_key(test_req2), b'dfkey1') From b66c43d1e04fb5205908e9ac5fcd3bea7db03e37 Mon Sep 17 00:00:00 2001 From: fsmeraldi Date: Tue, 25 Feb 2025 16:18:23 +0000 Subject: [PATCH 02/17] Get fingerprint function through crawler, import legacy function otherwise --- scrapy_deltafetch/middleware.py | 12 +++++++++++- tests/test_deltafetch.py | 13 +++++++++++-- 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/scrapy_deltafetch/middleware.py b/scrapy_deltafetch/middleware.py index a255000..f7c2e0e 100644 --- a/scrapy_deltafetch/middleware.py +++ b/scrapy_deltafetch/middleware.py @@ -30,7 +30,17 @@ def __init__(self, dir, reset=False, stats=None, 
crawler=None): self.dir = dir self.reset = reset self.stats = stats - self.fingerprint=RequestFingerprinter(crawler).fingerprint + if crawler and hasattr(crawler, 'request_fingerprinter'): + self.fingerprint=crawler.request_fingerprinter.fingerprint + else: + try: + # compatibility with Scrapy <2.7.0 + from scrapy.utils.request import request_fingerprint + self.fingerprint=request_fingerprint + except ImportError: + # use the new default + from scrapy.utils.request import fingerprint + self.fingerprint=fingerprint @classmethod def from_crawler(cls, crawler): diff --git a/tests/test_deltafetch.py b/tests/test_deltafetch.py index 7fafb58..9d8be4a 100644 --- a/tests/test_deltafetch.py +++ b/tests/test_deltafetch.py @@ -9,11 +9,17 @@ from scrapy.spiders import Spider from scrapy.settings import Settings from scrapy.exceptions import NotConfigured -from scrapy.utils.request import RequestFingerprinter from scrapy.utils.python import to_bytes from scrapy.statscollectors import StatsCollector from scrapy.utils.test import get_crawler +try: + from scrapy.utils.request import request_fingerprint + _legacy_fingerprint=True +except ImportError: + from scrapy.utils.request import RequestFingerprinter + _legacy_fingerprint=False + from scrapy_deltafetch.middleware import DeltaFetch @@ -319,7 +325,10 @@ def test_get_key(self): mw = self.mwcls(self.temp_dir, reset=True) test_req1 = Request('http://url1') crawler = get_crawler(Spider) - fingerprint=RequestFingerprinter(crawler).fingerprint + if _legacy_fingerprint: + fingerprint=request_fingerprint + else: + fingerprint=RequestFingerprinter(crawler).fingerprint self.assertEqual(mw._get_key(test_req1), to_bytes(fingerprint(test_req1))) test_req2 = Request('http://url2', meta={'deltafetch_key': b'dfkey1'}) From 0bd04d8ad9bc2fb4455cd944d4896b54b8314e56 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 25 Feb 2025 20:48:32 +0100 Subject: [PATCH 03/17] Move the logic to from_crawler to minimize backward incompatibility --- scrapy_deltafetch/middleware.py | 23 ++++++++++------------- tests/test_deltafetch.py | 22 ++++++++++++++++------ 2 files changed, 26 insertions(+), 19 deletions(-) diff --git a/scrapy_deltafetch/middleware.py b/scrapy_deltafetch/middleware.py index f7c2e0e..cca97e9 100644 --- a/scrapy_deltafetch/middleware.py +++ b/scrapy_deltafetch/middleware.py @@ -26,21 +26,10 @@ class DeltaFetch(object): intensive). 
""" - def __init__(self, dir, reset=False, stats=None, crawler=None): + def __init__(self, dir, reset=False, stats=None): self.dir = dir self.reset = reset self.stats = stats - if crawler and hasattr(crawler, 'request_fingerprinter'): - self.fingerprint=crawler.request_fingerprinter.fingerprint - else: - try: - # compatibility with Scrapy <2.7.0 - from scrapy.utils.request import request_fingerprint - self.fingerprint=request_fingerprint - except ImportError: - # use the new default - from scrapy.utils.request import fingerprint - self.fingerprint=fingerprint @classmethod def from_crawler(cls, crawler): @@ -49,9 +38,17 @@ def from_crawler(cls, crawler): raise NotConfigured dir = data_path(s.get('DELTAFETCH_DIR', 'deltafetch')) reset = s.getbool('DELTAFETCH_RESET') - o = cls(dir, reset, crawler.stats, crawler) + o = cls(dir, reset, crawler.stats) crawler.signals.connect(o.spider_opened, signal=signals.spider_opened) crawler.signals.connect(o.spider_closed, signal=signals.spider_closed) + + try: + o.fingerprint = crawler.request_fingerprinter.fingerprint + except AttributeError: + from scrapy.utils.request import request_fingerprint + + o.fingerprint = request_fingerprint + return o def spider_opened(self, spider): diff --git a/tests/test_deltafetch.py b/tests/test_deltafetch.py index 9d8be4a..0a99579 100644 --- a/tests/test_deltafetch.py +++ b/tests/test_deltafetch.py @@ -130,7 +130,7 @@ def test_spider_opened_reset_non_existing_db(self): self.spider.deltafetch_reset = True mw.spider_opened(self.spider) assert mw.db.get(b'random') is None - + def test_spider_opened_recreate(self): self._create_test_db() mw = self.mwcls(self.temp_dir, reset=True, stats=self.stats) @@ -191,7 +191,12 @@ def test_process_spider_output(self): def test_process_spider_output_with_ignored_request(self): self._create_test_db() - mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats) + settings = { + "DELTAFETCH_DIR": self.temp_dir, + "DELTAFETCH_ENABLED": True, + } + crawler = get_crawler(Spider, settings_dict=settings) + mw = self.mwcls.from_crawler(crawler) mw.spider_opened(self.spider) response = mock.Mock() response.request = Request('http://url') @@ -322,13 +327,18 @@ def __init__(self, dir, reset=False, *args, **kwargs): self.assertEqual(self.stats.get_value('deltafetch/stored'), None) def test_get_key(self): - mw = self.mwcls(self.temp_dir, reset=True) + settings = { + "DELTAFETCH_DIR": self.temp_dir, + "DELTAFETCH_ENABLED": True, + "DELTAFETCH_RESET": True, + } + crawler = get_crawler(Spider, settings_dict=settings) + mw = self.mwcls.from_crawler(crawler) test_req1 = Request('http://url1') - crawler = get_crawler(Spider) if _legacy_fingerprint: - fingerprint=request_fingerprint + fingerprint = request_fingerprint else: - fingerprint=RequestFingerprinter(crawler).fingerprint + fingerprint = RequestFingerprinter.from_crawler(crawler).fingerprint self.assertEqual(mw._get_key(test_req1), to_bytes(fingerprint(test_req1))) test_req2 = Request('http://url2', meta={'deltafetch_key': b'dfkey1'}) From a547f6573bdad06caae0b836bc7dcb1bc4a83590 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 25 Feb 2025 20:51:10 +0100 Subject: [PATCH 04/17] Fix CI --- .github/workflows/main.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 25fabf7..6d6cdba 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -9,7 +9,7 @@ on: jobs: build: - runs-on: ubuntu-latest + runs-on: ubuntu-20.04 strategy: 
matrix: python-version: [3.5, 3.6, 3.7, 3.8, 3.9] steps: - uses: actions/checkout@v2 - name: libddb run: | sudo apt-get install libdb-dev - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} - name: Cache pip @@ -38,4 +38,4 @@ jobs: pip install -r tests/requirements-test.txt - name: Test with pytest run: | - pytest \ No newline at end of file + pytest From a43a7af50d28e7a3e73c5bd951d9d0e69534a53b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 25 Feb 2025 20:53:16 +0100 Subject: [PATCH 05/17] Disable fail-fast in CI to find out which Python versions work --- .github/workflows/main.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 6d6cdba..6ec9a77 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -11,6 +11,7 @@ jobs: runs-on: ubuntu-20.04 strategy: + fail-fast: false matrix: python-version: [3.5, 3.6, 3.7, 3.8, 3.9] From d656da23c7c72c2f96cf99852a5813bb082596a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 25 Feb 2025 20:55:13 +0100 Subject: [PATCH 06/17] CI: Update Python setup --- .github/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 6ec9a77..82ed6b1 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -22,7 +22,7 @@ jobs: sudo apt-get install libdb-dev - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Cache pip From f918c71e2fc4f6d97f4b681f30ae40a25d4b2178 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 25 Feb 2025 20:59:45 +0100 Subject: [PATCH 07/17] Drop support for end-of-life Python versions --- .github/workflows/main.yml | 9 +++++++-- setup.py | 11 ++++++----- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 82ed6b1..1f079fd 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -9,11 +9,16 @@ jobs: build: - runs-on: ubuntu-20.04 + runs-on: ubuntu-latest strategy: fail-fast: false matrix: - python-version: [3.5, 3.6, 3.7, 3.8, 3.9] + include: + - python-version: "3.9" + - python-version: "3.10" + - python-version: "3.11" + - python-version: "3.12" + - python-version: "3.13" steps: - uses: actions/checkout@v2 diff --git a/setup.py b/setup.py index ad86588..6bcd948 100644 --- a/setup.py +++ b/setup.py @@ -17,11 +17,12 @@ 'Operating System :: OS Independent', 'Programming Language :: Python', 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.5', - 'Programming Language :: Python :: 3.6', - 'Programming Language :: Python :: 3.7', - 'Programming Language :: Python :: 3.8', 'Programming Language :: Python :: 3.9', + 'Programming Language :: Python :: 3.10', + 'Programming Language :: Python :: 3.11', + 'Programming Language :: Python :: 3.12', + 'Programming Language :: Python :: 3.13', ], - install_requires=['Scrapy>=1.1.0'] + install_requires=['Scrapy>=1.1.0'], + python_requires='>=3.9', ) From 3352cbc7fbc149404868a39af49fa466ad4d6928 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 25 Feb 2025 21:40:07 +0100 Subject: [PATCH 08/17] Use ruff --- .pre-commit-config.yaml | 7 + pyproject.toml | 105 ++++++++++++ scrapy_deltafetch/__init__.py | 2 +- scrapy_deltafetch/middleware.py | 65 ++++---- setup.py | 44 ++--- tests/benchmark.py | 12 +- tests/test_deltafetch.py | 284 +++++++++++++++----------------- 7 files changed, 306 insertions(+), 213 deletions(-) create mode 100644 
.pre-commit-config.yaml create mode 100644 pyproject.toml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..d3e17be --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,7 @@ +repos: +- repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.9.7 + hooks: + - id: ruff + args: [ --fix ] + - id: ruff-format diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..5bc5e1f --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,105 @@ +[tool.ruff.lint] +extend-select = [ + # flake8-bugbear + "B", + # flake8-comprehensions + "C4", + # pydocstyle + "D", + # flake8-future-annotations + "FA", + # flynt + "FLY", + # refurb + "FURB", + # isort + "I", + # flake8-implicit-str-concat + "ISC", + # flake8-logging + "LOG", + # Perflint + "PERF", + # pygrep-hooks + "PGH", + # flake8-pie + "PIE", + # pylint + "PL", + # flake8-pytest-style + "PT", + # flake8-use-pathlib + "PTH", + # flake8-pyi + "PYI", + # flake8-quotes + "Q", + # flake8-return + "RET", + # flake8-raise + "RSE", + # Ruff-specific rules + "RUF", + # flake8-bandit + "S", + # flake8-simplify + "SIM", + # flake8-slots + "SLOT", + # flake8-debugger + "T10", + # flake8-type-checking + "TC", + # pyupgrade + "UP", + # pycodestyle warnings + "W", + # flake8-2020 + "YTT", +] +ignore = [ + # Missing docstring in public module + "D100", + # Missing docstring in public class + "D101", + # Missing docstring in public function + "D103", + # Missing docstring in public package + "D104", + # Missing docstring in magic method + "D105", + # Missing docstring in __init__ + "D107", + # One-line docstring should fit on one line with quotes + "D200", + # No blank lines allowed after function docstring + "D202", + # 1 blank line required between summary line and description + "D205", + # Multi-line docstring closing quotes should be on a separate line + "D209", + # First line should end with a period + "D400", + # First line should be in imperative mood; try rephrasing + "D401", + # First line should not be the function's "signature" + "D402", + # Too many return statements + "PLR0911", + # Too many branches + "PLR0912", + # Too many arguments in function definition + "PLR0913", + # Too many statements + "PLR0915", + # Magic value used in comparison + "PLR2004", + # Mutable class attributes should be annotated with `typing.ClassVar` + "RUF012", + # Use of `assert` detected + "S101", +] + +[tool.ruff.lint.per-file-ignores] +# D102: Missing docstring in public method +"tests/**" = ["D102"] diff --git a/scrapy_deltafetch/__init__.py b/scrapy_deltafetch/__init__.py index 40b1066..a229437 100644 --- a/scrapy_deltafetch/__init__.py +++ b/scrapy_deltafetch/__init__.py @@ -1,4 +1,4 @@ from .middleware import DeltaFetch - +__all__ = ["DeltaFetch"] __version__ = "2.0.1" diff --git a/scrapy_deltafetch/middleware.py b/scrapy_deltafetch/middleware.py index 9f6f1b4..fafbb79 100644 --- a/scrapy_deltafetch/middleware.py +++ b/scrapy_deltafetch/middleware.py @@ -1,24 +1,22 @@ +import dbm import logging -import os import time -import dbm +from pathlib import Path +from scrapy import signals +from scrapy.exceptions import NotConfigured from scrapy.http import Request from scrapy.item import Item -from scrapy.utils.request import request_fingerprint from scrapy.utils.project import data_path from scrapy.utils.python import to_bytes -from scrapy.exceptions import NotConfigured -from scrapy import signals - +from scrapy.utils.request import request_fingerprint logger = logging.getLogger(__name__) -class 
DeltaFetch(object): - """ - This is a spider middleware to ignore requests to pages containing items - seen in previous crawls of the same spider, thus producing a "delta crawl" +class DeltaFetch: + """Spider middleware to ignore requests to pages containing items seen in + previous crawls of the same spider, thus producing a "delta crawl" containing only new items. This also speeds up the crawl, by reducing the number of requests that need @@ -32,56 +30,57 @@ def __init__(self, dir, reset=False, stats=None): self.stats = stats @classmethod - def from_crawler(cls, crawler): + def from_crawler(cls, crawler): # noqa: D102 s = crawler.settings - if not s.getbool('DELTAFETCH_ENABLED'): + if not s.getbool("DELTAFETCH_ENABLED"): raise NotConfigured - dir = data_path(s.get('DELTAFETCH_DIR', 'deltafetch')) - reset = s.getbool('DELTAFETCH_RESET') + dir = data_path(s.get("DELTAFETCH_DIR", "deltafetch")) + reset = s.getbool("DELTAFETCH_RESET") o = cls(dir, reset, crawler.stats) crawler.signals.connect(o.spider_opened, signal=signals.spider_opened) crawler.signals.connect(o.spider_closed, signal=signals.spider_closed) return o - def spider_opened(self, spider): - if not os.path.exists(self.dir): - os.makedirs(self.dir) + def spider_opened(self, spider): # noqa: D102 + dir = Path(self.dir) + dir.mkdir(parents=True, exist_ok=True) # TODO may be tricky, as there may be different paths on systems - dbpath = os.path.join(self.dir, '%s.db' % spider.name) - reset = self.reset or getattr(spider, 'deltafetch_reset', False) - flag = 'n' if reset else 'c' + dbpath = dir / f"{spider.name}.db" + reset = self.reset or getattr(spider, "deltafetch_reset", False) + flag = "n" if reset else "c" try: - self.db = dbm.open(dbpath, flag=flag) + self.db = dbm.open(dbpath, flag=flag) # noqa: SIM115 except Exception: - logger.warning("Failed to open DeltaFetch database at %s, " - "trying to recreate it" % dbpath) - if os.path.exists(dbpath): - os.remove(dbpath) - self.db = dbm.open(dbpath, 'c') + logger.warning( + f"Failed to open DeltaFetch database at {dbpath}, trying to recreate it" + ) + if dbpath.exists(): + dbpath.unlink() + self.db = dbm.open(dbpath, "c") # noqa: SIM115 - def spider_closed(self, spider): + def spider_closed(self, spider): # noqa: D102 self.db.close() - def process_spider_output(self, response, result, spider): + def process_spider_output(self, response, result, spider): # noqa: D102 for r in result: if isinstance(r, Request): key = self._get_key(r) if key in self.db and self._is_enabled_for_request(r): - logger.info("Ignoring already visited: %s" % r) + logger.info(f"Ignoring already visited: {r}") if self.stats: - self.stats.inc_value('deltafetch/skipped', spider=spider) + self.stats.inc_value("deltafetch/skipped", spider=spider) continue elif isinstance(r, (Item, dict)): key = self._get_key(response.request) self.db[key] = str(time.time()) if self.stats: - self.stats.inc_value('deltafetch/stored', spider=spider) + self.stats.inc_value("deltafetch/stored", spider=spider) yield r def _get_key(self, request): - key = request.meta.get('deltafetch_key') or request_fingerprint(request) + key = request.meta.get("deltafetch_key") or request_fingerprint(request) return to_bytes(key) def _is_enabled_for_request(self, request): # Gives you option to disable deltafetch for some requests - return request.meta.get('deltafetch_enabled', True) + return request.meta.get("deltafetch_enabled", True) diff --git a/setup.py b/setup.py index ad86588..a5bb696 100644 --- a/setup.py +++ b/setup.py @@ -1,27 +1,29 @@ +from 
pathlib import Path + from setuptools import setup setup( - name='scrapy-deltafetch', - version='2.0.1', - license='BSD', - description='Scrapy middleware to ignore previously crawled pages', - long_description=open('README.rst').read(), - author='Zyte', - author_email='opensource@zyte.com', - url='http://github.com/scrapy-plugins/scrapy-deltafetch', - packages=['scrapy_deltafetch'], - platforms=['Any'], + name="scrapy-deltafetch", + version="2.0.1", + license="BSD", + description="Scrapy middleware to ignore previously crawled pages", + long_description=Path("README.rst").read_text(encoding="utf-8"), + author="Zyte", + author_email="opensource@zyte.com", + url="http://github.com/scrapy-plugins/scrapy-deltafetch", + packages=["scrapy_deltafetch"], + platforms=["Any"], classifiers=[ - 'Development Status :: 4 - Beta', - 'License :: OSI Approved :: BSD License', - 'Operating System :: OS Independent', - 'Programming Language :: Python', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.5', - 'Programming Language :: Python :: 3.6', - 'Programming Language :: Python :: 3.7', - 'Programming Language :: Python :: 3.8', - 'Programming Language :: Python :: 3.9', + "Development Status :: 4 - Beta", + "License :: OSI Approved :: BSD License", + "Operating System :: OS Independent", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.5", + "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", ], - install_requires=['Scrapy>=1.1.0'] + install_requires=["Scrapy>=1.1.0"], ) diff --git a/tests/benchmark.py b/tests/benchmark.py index a8edee5..555500b 100644 --- a/tests/benchmark.py +++ b/tests/benchmark.py @@ -1,6 +1,6 @@ import tempfile +from unittest import mock -import mock from scrapy import Request, Spider from scrapy.statscollectors import StatsCollector from scrapy.utils.test import get_crawler @@ -9,7 +9,7 @@ def benchmark_middleware(result): - spider_name = 'df_tests' + spider_name = "df_tests" spider = Spider(spider_name) temp_dir = tempfile.gettempdir() crawler = get_crawler(Spider) @@ -17,15 +17,15 @@ def benchmark_middleware(result): mw = DeltaFetch(temp_dir, reset=False, stats=stats) mw.spider_opened(spider) response = mock.Mock() - response.request = Request('http://url', - meta={'deltafetch_key': 'key'}) + response.request = Request("http://url", meta={"deltafetch_key": "key"}) - for x in mw.process_spider_output(response, result, spider): + for _x in mw.process_spider_output(response, result, spider): pass + def test_middleware(benchmark): result = [] for x in range(50000): - request = Request(f'https://{x}') + request = Request(f"https://{x}") result.append(request) result = benchmark(benchmark_middleware, result) diff --git a/tests/test_deltafetch.py b/tests/test_deltafetch.py index 362bf1e..5143790 100644 --- a/tests/test_deltafetch.py +++ b/tests/test_deltafetch.py @@ -1,111 +1,112 @@ -from unittest import TestCase, skipIf - -import os import dbm -import mock import tempfile +from pathlib import Path +from unittest import TestCase, mock + +import pytest from scrapy import Request +from scrapy.exceptions import NotConfigured from scrapy.item import Item -from scrapy.spiders import Spider from scrapy.settings import Settings -from scrapy.exceptions import NotConfigured -from scrapy.utils.request import request_fingerprint -from scrapy.utils.python import to_bytes +from 
scrapy.spiders import Spider from scrapy.statscollectors import StatsCollector +from scrapy.utils.python import to_bytes +from scrapy.utils.request import request_fingerprint from scrapy.utils.test import get_crawler from scrapy_deltafetch.middleware import DeltaFetch class DeltaFetchTestCase(TestCase): - mwcls = DeltaFetch def setUp(self): - self.spider_name = 'df_tests' + self.spider_name = "df_tests" self.spider = Spider(self.spider_name) # DeltaFetch creates .db files named after the spider's name - self.temp_dir = tempfile.gettempdir() - self.db_path = os.path.join(self.temp_dir, '%s.db' % self.spider.name) + self.temp_dir = Path(tempfile.gettempdir()) + self.db_path = self.temp_dir / f"{self.spider.name}.db" crawler = get_crawler(Spider) self.stats = StatsCollector(crawler) def test_init(self): # path format is any, the folder is not created - instance = self.mwcls('/any/dir', True, stats=self.stats) + instance = self.mwcls("/any/dir", True, stats=self.stats) assert isinstance(instance, self.mwcls) - self.assertEqual(instance.dir, '/any/dir') - self.assertEqual(self.stats.get_stats(), {}) - self.assertEqual(instance.reset, True) + assert instance.dir == "/any/dir" + assert self.stats.get_stats() == {} + assert instance.reset is True def test_init_from_crawler(self): crawler = mock.Mock() # void settings crawler.settings = Settings({}) - self.assertRaises(NotConfigured, self.mwcls.from_crawler, crawler) - with mock.patch('scrapy.utils.project.project_data_dir') as data_dir, \ - mock.patch('scrapy.utils.project.inside_project') as in_project: + with pytest.raises(NotConfigured): + self.mwcls.from_crawler(crawler) + with ( + mock.patch("scrapy.utils.project.project_data_dir") as data_dir, + mock.patch("scrapy.utils.project.inside_project") as in_project, + ): data_dir.return_value = self.temp_dir in_project.return_value = True # simple project_data_dir mock with based settings - crawler.settings = Settings({'DELTAFETCH_ENABLED': True}) + crawler.settings = Settings({"DELTAFETCH_ENABLED": True}) instance = self.mwcls.from_crawler(crawler) assert isinstance(instance, self.mwcls) - self.assertEqual( - instance.dir, os.path.join(self.temp_dir, 'deltafetch')) - self.assertEqual(instance.reset, False) + assert instance.dir == str(self.temp_dir / "deltafetch") + assert instance.reset is False # project_data_dir mock with advanced settings - crawler.settings = Settings({'DELTAFETCH_ENABLED': True, - 'DELTAFETCH_DIR': 'other', - 'DELTAFETCH_RESET': True}) + crawler.settings = Settings( + { + "DELTAFETCH_ENABLED": True, + "DELTAFETCH_DIR": "other", + "DELTAFETCH_RESET": True, + } + ) instance = self.mwcls.from_crawler(crawler) assert isinstance(instance, self.mwcls) - self.assertEqual( - instance.dir, os.path.join(self.temp_dir, 'other')) - self.assertEqual(instance.reset, True) + assert instance.dir == str(self.temp_dir / "other") + assert instance.reset is True def test_spider_opened_new(self): """Middleware should create a .db file if not found.""" - if os.path.exists(self.db_path): - os.remove(self.db_path) + if self.db_path.exists(): + self.db_path.unlink() mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats) - assert not hasattr(self.mwcls, 'db') + assert not hasattr(self.mwcls, "db") mw.spider_opened(self.spider) - assert os.path.isdir(self.temp_dir) - assert os.path.exists(self.db_path) - assert hasattr(mw, 'db') + assert self.temp_dir.is_dir() + assert self.db_path.exists() + assert hasattr(mw, "db") assert mw.db.keys() == [] def test_spider_opened_existing(self): """Middleware 
should open and use existing and valid .db files.""" self._create_test_db() mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats) - assert not hasattr(self.mwcls, 'db') + assert not hasattr(self.mwcls, "db") mw.spider_opened(self.spider) - assert hasattr(mw, 'db') - for k, v in [ - (b'test_key_1', b'test_v_1'), - (b'test_key_2', b'test_v_2') - ]: + assert hasattr(mw, "db") + for k, v in [(b"test_key_1", b"test_v_1"), (b"test_key_2", b"test_v_2")]: assert mw.db.get(k) == v def test_spider_opened_corrupt_dbfile(self): """Middleware should create a new .db if it cannot open it.""" # create an invalid .db file - with open(self.db_path, "wb") as dbfile: - dbfile.write(b'bad') + with self.db_path.open("wb") as dbfile: + dbfile.write(b"bad") mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats) - assert not hasattr(self.mwcls, 'db') + assert not hasattr(self.mwcls, "db") # file corruption is only detected when opening spider mw.spider_opened(self.spider) - assert os.path.isdir(self.temp_dir) - assert os.path.exists(self.db_path) - assert hasattr(mw, 'db') + assert Path(self.temp_dir).is_dir() + assert Path(self.db_path).exists() + assert hasattr(mw, "db") # and db should be empty (it was re-created) assert mw.db.keys() == [] @@ -113,223 +114,202 @@ def test_spider_opened_corrupt_dbfile(self): def test_spider_opened_existing_spider_reset(self): self._create_test_db() mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats) - assert not hasattr(self.mwcls, 'db') + assert not hasattr(self.mwcls, "db") self.spider.deltafetch_reset = True mw.spider_opened(self.spider) assert mw.db.keys() == [] def test_spider_opened_reset_non_existing_db(self): mw = self.mwcls(self.temp_dir, reset=True, stats=self.stats) - assert not hasattr(self.mwcls, 'db') + assert not hasattr(self.mwcls, "db") self.spider.deltafetch_reset = True mw.spider_opened(self.spider) - assert mw.db.get(b'random') is None - + assert mw.db.get(b"random") is None + def test_spider_opened_recreate(self): self._create_test_db() mw = self.mwcls(self.temp_dir, reset=True, stats=self.stats) - assert not hasattr(self.mwcls, 'db') + assert not hasattr(self.mwcls, "db") mw.spider_opened(self.spider) - assert hasattr(mw, 'db') + assert hasattr(mw, "db") assert mw.db.keys() == [] def test_spider_closed(self): self._create_test_db() mw = self.mwcls(self.temp_dir, reset=True, stats=self.stats) mw.spider_opened(self.spider) - assert mw.db.get('random') is None + assert mw.db.get("random") is None mw.spider_closed(self.spider) - with self.assertRaises(Exception) as cm: - # should fail because database closed - mw.db.get('radom') - # self.assertRaisesRegex(, mw.db.get('random')) + with pytest.raises(dbm.error): + mw.db.get("radom") def test_process_spider_output(self): self._create_test_db() mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats) mw.spider_opened(self.spider) response = mock.Mock() - response.request = Request('http://url', - meta={'deltafetch_key': 'key'}) + response.request = Request("http://url", meta={"deltafetch_key": "key"}) result = [] - self.assertEqual(list(mw.process_spider_output( - response, result, self.spider)), []) + assert list(mw.process_spider_output(response, result, self.spider)) == [] result = [ # same URL but with new key --> it should be processed - Request('http://url', meta={'deltafetch_key': 'key1'}), - + Request("http://url", meta={"deltafetch_key": "key1"}), # 'test_key_1' is already in the test db --> it should be skipped - Request('http://url1', meta={'deltafetch_key': 'test_key_1'}) 
+ Request("http://url1", meta={"deltafetch_key": "test_key_1"}), ] # so only the 1 request should go through - self.assertEqual(list(mw.process_spider_output( - response, result, self.spider)), [result[0]]) + assert list(mw.process_spider_output(response, result, self.spider)) == [ + result[0] + ] # the skipped "http://url1" should be counted in stats - self.assertEqual(self.stats.get_stats(), {'deltafetch/skipped': 1}) + assert self.stats.get_stats() == {"deltafetch/skipped": 1} # b'key' should not be in the db yet as no item was collected yet - self.assertEqual(set(mw.db.keys()), - set([b'test_key_1', - b'test_key_2'])) + assert set(mw.db.keys()) == {b"test_key_1", b"test_key_2"} # if the spider returns items, the request's key is added in db result = [Item(), "not a base item"] - self.assertEqual(list(mw.process_spider_output( - response, result, self.spider)), result) - self.assertEqual(set(mw.db.keys()), - set([b'key', - b'test_key_1', - b'test_key_2'])) - assert mw.db[b'key'] + assert list(mw.process_spider_output(response, result, self.spider)) == result + assert set(mw.db.keys()) == {b"key", b"test_key_1", b"test_key_2"} + assert mw.db[b"key"] def test_process_spider_output_with_ignored_request(self): self._create_test_db() mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats) mw.spider_opened(self.spider) response = mock.Mock() - response.request = Request('http://url') + response.request = Request("http://url") result = [] - self.assertEqual( - list(mw.process_spider_output(response, result, self.spider)), []) + assert list(mw.process_spider_output(response, result, self.spider)) == [] result = [ - Request('http://url1'), + Request("http://url1"), # 'url1' is already in the db, but deltafetch_enabled=False # flag is set, URL should be processed. 
- Request('http://url1', - meta={ - 'deltafetch_enabled': False - }) + Request("http://url1", meta={"deltafetch_enabled": False}), ] # so 2 requests should go through - self.assertEqual( - list(mw.process_spider_output(response, result, self.spider)), - [result[0], result[1]]) + assert list(mw.process_spider_output(response, result, self.spider)) == [ + result[0], + result[1], + ] def test_process_spider_output_dict(self): self._create_test_db() mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats) mw.spider_opened(self.spider) response = mock.Mock() - response.request = Request('http://url', - meta={'deltafetch_key': 'key'}) + response.request = Request("http://url", meta={"deltafetch_key": "key"}) result = [{"somekey": "somevalue"}] - self.assertEqual(list(mw.process_spider_output( - response, result, self.spider)), result) - self.assertEqual(set(mw.db.keys()), - set([b'key', - b'test_key_1', - b'test_key_2'])) - assert mw.db[b'key'] + assert list(mw.process_spider_output(response, result, self.spider)) == result + assert set(mw.db.keys()) == {b"key", b"test_key_1", b"test_key_2"} + assert mw.db[b"key"] def test_process_spider_output_stats(self): self._create_test_db() mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats) mw.spider_opened(self.spider) response = mock.Mock() - response.request = Request('http://url', - meta={'deltafetch_key': 'key'}) + response.request = Request("http://url", meta={"deltafetch_key": "key"}) result = [] - self.assertEqual(list(mw.process_spider_output( - response, result, self.spider)), []) - self.assertEqual(self.stats.get_stats(), {}) + assert list(mw.process_spider_output(response, result, self.spider)) == [] + assert self.stats.get_stats() == {} result = [ - Request('http://url', meta={'deltafetch_key': 'key'}), - Request('http://url1', meta={'deltafetch_key': 'test_key_1'}) + Request("http://url", meta={"deltafetch_key": "key"}), + Request("http://url1", meta={"deltafetch_key": "test_key_1"}), ] - self.assertEqual(list(mw.process_spider_output( - response, result, self.spider)), [result[0]]) - self.assertEqual(self.stats.get_value('deltafetch/skipped'), 1) + assert list(mw.process_spider_output(response, result, self.spider)) == [ + result[0] + ] + assert self.stats.get_value("deltafetch/skipped") == 1 result = [Item(), "not a base item"] - self.assertEqual(list(mw.process_spider_output( - response, result, self.spider)), result) - self.assertEqual(self.stats.get_value('deltafetch/stored'), 1) + assert list(mw.process_spider_output(response, result, self.spider)) == result + assert self.stats.get_value("deltafetch/stored") == 1 def test_init_from_crawler_legacy(self): # test with subclass not handling passed stats class LegacyDeltaFetchSubClass(self.mwcls): - def __init__(self, dir, reset=False, *args, **kwargs): - super(LegacyDeltaFetchSubClass, self).__init__(dir=dir, reset=reset) + super().__init__(dir=dir, reset=reset) self.something = True crawler = mock.Mock() # void settings crawler.settings = Settings({}) - self.assertRaises(NotConfigured, self.mwcls.from_crawler, crawler) + with pytest.raises(NotConfigured): + self.mwcls.from_crawler(crawler) - with mock.patch('scrapy.utils.project.project_data_dir') as data_dir, \ - mock.patch('scrapy.utils.project.inside_project') as in_project: + with ( + mock.patch("scrapy.utils.project.project_data_dir") as data_dir, + mock.patch("scrapy.utils.project.inside_project") as in_project, + ): data_dir.return_value = self.temp_dir in_project.return_value = True # simple project_data_dir mock 
with based settings - crawler.settings = Settings({'DELTAFETCH_ENABLED': True}) + crawler.settings = Settings({"DELTAFETCH_ENABLED": True}) instance = LegacyDeltaFetchSubClass.from_crawler(crawler) assert isinstance(instance, self.mwcls) - self.assertEqual( - instance.dir, os.path.join(self.temp_dir, 'deltafetch')) - self.assertEqual(instance.reset, False) + assert instance.dir == str(Path(self.temp_dir) / "deltafetch") + assert instance.reset is False # project_data_dir mock with advanced settings - crawler.settings = Settings({'DELTAFETCH_ENABLED': True, - 'DELTAFETCH_DIR': 'other', - 'DELTAFETCH_RESET': True}) + crawler.settings = Settings( + { + "DELTAFETCH_ENABLED": True, + "DELTAFETCH_DIR": "other", + "DELTAFETCH_RESET": True, + } + ) instance = LegacyDeltaFetchSubClass.from_crawler(crawler) assert isinstance(instance, self.mwcls) - self.assertEqual( - instance.dir, os.path.join(self.temp_dir, 'other')) - self.assertEqual(instance.reset, True) + assert instance.dir == str(Path(self.temp_dir) / "other") + assert instance.reset is True def test_process_spider_output_stats_legacy(self): # testing the subclass not handling stats works at runtime # (i.e. that trying to update stats does not trigger exception) class LegacyDeltaFetchSubClass(self.mwcls): - def __init__(self, dir, reset=False, *args, **kwargs): - super(LegacyDeltaFetchSubClass, self).__init__(dir=dir, reset=reset) + super().__init__(dir=dir, reset=reset) self.something = True self._create_test_db() mw = LegacyDeltaFetchSubClass(self.temp_dir, reset=False) mw.spider_opened(self.spider) response = mock.Mock() - response.request = Request('http://url', - meta={'deltafetch_key': 'key'}) + response.request = Request("http://url", meta={"deltafetch_key": "key"}) result = [] - self.assertEqual(list(mw.process_spider_output( - response, result, self.spider)), []) - self.assertEqual(self.stats.get_stats(), {}) + assert list(mw.process_spider_output(response, result, self.spider)) == [] + assert self.stats.get_stats() == {} result = [ - Request('http://url', meta={'deltafetch_key': 'key'}), - Request('http://url1', meta={'deltafetch_key': 'test_key_1'}) + Request("http://url", meta={"deltafetch_key": "key"}), + Request("http://url1", meta={"deltafetch_key": "test_key_1"}), ] # stats should not be updated - self.assertEqual(list(mw.process_spider_output( - response, result, self.spider)), [result[0]]) - self.assertEqual(self.stats.get_value('deltafetch/skipped'), None) + assert list(mw.process_spider_output(response, result, self.spider)) == [ + result[0] + ] + assert self.stats.get_value("deltafetch/skipped") is None result = [Item(), "not a base item"] - self.assertEqual(list(mw.process_spider_output( - response, result, self.spider)), result) - self.assertEqual(self.stats.get_value('deltafetch/stored'), None) + assert list(mw.process_spider_output(response, result, self.spider)) == result + assert self.stats.get_value("deltafetch/stored") is None def test_get_key(self): mw = self.mwcls(self.temp_dir, reset=True) - test_req1 = Request('http://url1') - self.assertEqual(mw._get_key(test_req1), - to_bytes(request_fingerprint(test_req1))) - test_req2 = Request('http://url2', meta={'deltafetch_key': b'dfkey1'}) - self.assertEqual(mw._get_key(test_req2), b'dfkey1') + test_req1 = Request("http://url1") + assert mw._get_key(test_req1) == to_bytes(request_fingerprint(test_req1)) + test_req2 = Request("http://url2", meta={"deltafetch_key": b"dfkey1"}) + assert mw._get_key(test_req2) == b"dfkey1" - test_req3 = Request('http://url2', 
meta={'deltafetch_key': u'dfkey1'}) + test_req3 = Request("http://url2", meta={"deltafetch_key": "dfkey1"}) # key will be converted to bytes - self.assertEqual(mw._get_key(test_req3), b'dfkey1') + assert mw._get_key(test_req3) == b"dfkey1" def _create_test_db(self): # truncate test db if there were failed tests - db = dbm.open(self.db_path, 'n') - db[b'test_key_1'] = b'test_v_1' - db[b'test_key_2'] = b'test_v_2' - db.close() + with dbm.open(self.db_path, "n") as db: + db[b"test_key_1"] = b"test_v_1" + db[b"test_key_2"] = b"test_v_2" From 13fd8aeb3181074dffe4ac972e192ef6e5cb1f8d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 26 Feb 2025 00:10:58 +0100 Subject: [PATCH 09/17] Update CI --- .github/workflows/main.yml | 42 +++++-------- pyproject.toml | 74 +++++++++++++++++++++++ requirements.txt | 1 - scrapy_deltafetch/middleware.py | 9 +-- tests/requirements-test.txt | 5 -- tests/test_deltafetch.py | 104 ++++++++++++++++++-------------- tox.ini | 50 +++++++++++++++ 7 files changed, 204 insertions(+), 81 deletions(-) delete mode 100644 requirements.txt delete mode 100644 tests/requirements-test.txt create mode 100644 tox.ini diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 1f079fd..14c5506 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -1,47 +1,37 @@ name: CI - -on: - push: - branches: [ master ] - pull_request: - branches: [ master ] - +on: [push, pull_request] jobs: - build: - + test: runs-on: ubuntu-latest strategy: fail-fast: false matrix: include: + - python-version: "3.9" + toxenv: min - python-version: "3.9" - python-version: "3.10" - python-version: "3.11" - python-version: "3.12" - python-version: "3.13" - + - python-version: "3.13" + toxenv: pre-commit + - python-version: "3.13" + toxenv: mypy + - python-version: "3.13" + toxenv: pylint + - python-version: "3.13" + toxenv: twinecheck steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: libddb run: | sudo apt-get install libdb-dev - - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - - name: Cache pip - uses: actions/cache@v2 - with: - path: ~/.cache/pip - key: ${{ runner.os}}-pip-${{ hashFiles('tests/requirements-test.txt') }} - restore-keys: | - ${{ runner.os}}-pip- - ${{ runner.os}}- - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install -r tests/requirements-test.txt - - name: Test with pytest + - name: Run run: | - pytest + pip install -U tox + tox -e ${{ matrix.toxenv }} diff --git a/pyproject.toml b/pyproject.toml index 5bc5e1f..06a3f79 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,3 +1,74 @@ +[tool.coverage.run] +branch = true +include = ["scrapy_deltafetch/*"] +omit = ["tests/*"] +disable_warnings = ["include-ignored"] + +[tool.coverage.paths] +source = [ + "scrapy_deltafetch", + ".tox/**/site-packages/scrapy-deltafetch" +] + +[tool.coverage.report] +# https://github.com/nedbat/coveragepy/issues/831#issuecomment-517778185 +exclude_lines = ["pragma: no cover", "if TYPE_CHECKING:"] + + + +[tool.pylint.MASTER] +persistent = "no" +jobs = 1 # >1 hides results + +[tool.pylint."MESSAGES CONTROL"] +enable = [ + "useless-suppression", +] +disable = [ + # Ones we want to ignore + "attribute-defined-outside-init", + "broad-exception-caught", + "consider-using-with", + "cyclic-import", + "disallowed-name", + "duplicate-code", # https://github.com/pylint-dev/pylint/issues/214 + "fixme", + 
"import-outside-toplevel", + "inherit-non-class", # false positives with create_deprecated_class() + "invalid-name", + "invalid-overridden-method", + "isinstance-second-argument-not-valid-type", # false positives with create_deprecated_class() + "line-too-long", + "logging-format-interpolation", + "logging-fstring-interpolation", + "logging-not-lazy", + "missing-docstring", + "no-member", + "no-name-in-module", # caught by mypy already + "no-value-for-parameter", # https://github.com/pylint-dev/pylint/issues/3268 + "not-callable", + "protected-access", + "redefined-builtin", + "redefined-outer-name", + "too-few-public-methods", + "too-many-ancestors", + "too-many-arguments", + "too-many-branches", + "too-many-function-args", + "too-many-instance-attributes", + "too-many-lines", + "too-many-locals", + "too-many-positional-arguments", + "too-many-public-methods", + "too-many-return-statements", + "unused-argument", + "unused-import", + "unused-variable", + "useless-import-alias", # used as a hint to mypy + "useless-return", # https://github.com/pylint-dev/pylint/issues/6530 + "wrong-import-position", +] + [tool.ruff.lint] extend-select = [ # flake8-bugbear @@ -103,3 +174,6 @@ ignore = [ [tool.ruff.lint.per-file-ignores] # D102: Missing docstring in public method "tests/**" = ["D102"] + +[tool.ruff.lint.pydocstyle] +convention = "pep257" diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 0b9465e..0000000 --- a/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -scrapy>=1.1.0 diff --git a/scrapy_deltafetch/middleware.py b/scrapy_deltafetch/middleware.py index 471919e..bd686f7 100644 --- a/scrapy_deltafetch/middleware.py +++ b/scrapy_deltafetch/middleware.py @@ -6,7 +6,6 @@ from scrapy import signals from scrapy.exceptions import NotConfigured from scrapy.http import Request -from scrapy.item import Item from scrapy.utils.project import data_path from scrapy.utils.python import to_bytes @@ -36,6 +35,8 @@ def from_crawler(cls, crawler): # noqa: D102 dir = data_path(s.get("DELTAFETCH_DIR", "deltafetch")) reset = s.getbool("DELTAFETCH_RESET") o = cls(dir, reset, crawler.stats) + if o.stats is None: + o.stats = crawler.stats crawler.signals.connect(o.spider_opened, signal=signals.spider_opened) crawler.signals.connect(o.spider_closed, signal=signals.spider_closed) @@ -56,14 +57,14 @@ def spider_opened(self, spider): # noqa: D102 reset = self.reset or getattr(spider, "deltafetch_reset", False) flag = "n" if reset else "c" try: - self.db = dbm.open(dbpath, flag=flag) # noqa: SIM115 + self.db = dbm.open(str(dbpath), flag=flag) # noqa: SIM115 except Exception: logger.warning( f"Failed to open DeltaFetch database at {dbpath}, trying to recreate it" ) if dbpath.exists(): dbpath.unlink() - self.db = dbm.open(dbpath, "c") # noqa: SIM115 + self.db = dbm.open(str(dbpath), "c") # noqa: SIM115 def spider_closed(self, spider): # noqa: D102 self.db.close() @@ -77,7 +78,7 @@ def process_spider_output(self, response, result, spider): # noqa: D102 if self.stats: self.stats.inc_value("deltafetch/skipped", spider=spider) continue - elif isinstance(r, (Item, dict)): + else: key = self._get_key(response.request) self.db[key] = str(time.time()) if self.stats: diff --git a/tests/requirements-test.txt b/tests/requirements-test.txt deleted file mode 100644 index bf2c733..0000000 --- a/tests/requirements-test.txt +++ /dev/null @@ -1,5 +0,0 @@ --r ../requirements.txt -mock -pytest -pytest-benchmark - diff --git a/tests/test_deltafetch.py b/tests/test_deltafetch.py index 6b049d7..6e42d58 100644 --- 
a/tests/test_deltafetch.py +++ b/tests/test_deltafetch.py @@ -1,5 +1,6 @@ import dbm import tempfile +from dataclasses import dataclass from pathlib import Path from unittest import TestCase, mock @@ -9,19 +10,9 @@ from scrapy.item import Item from scrapy.settings import Settings from scrapy.spiders import Spider -from scrapy.statscollectors import StatsCollector from scrapy.utils.python import to_bytes from scrapy.utils.test import get_crawler -try: - from scrapy.utils.request import request_fingerprint - - _legacy_fingerprint = True -except ImportError: - from scrapy.utils.request import RequestFingerprinter - - _legacy_fingerprint = False - from scrapy_deltafetch.middleware import DeltaFetch @@ -36,15 +27,23 @@ def setUp(self): self.temp_dir = Path(tempfile.gettempdir()) self.db_path = self.temp_dir / f"{self.spider.name}.db" - crawler = get_crawler(Spider) - self.stats = StatsCollector(crawler) + def get_mw(self, dir=None, reset=None, cls=DeltaFetch): + settings = { + "DELTAFETCH_ENABLED": True, + } + if dir is not None: + settings["DELTAFETCH_DIR"] = dir + if reset is not None: + settings["DELTAFETCH_RESET"] = reset + crawler = get_crawler(Spider, settings_dict=settings) + return cls.from_crawler(crawler) def test_init(self): # path format is any, the folder is not created - instance = self.mwcls("/any/dir", True, stats=self.stats) + instance = self.get_mw("/any/dir", reset=True) assert isinstance(instance, self.mwcls) assert instance.dir == "/any/dir" - assert self.stats.get_stats() == {} + assert instance.stats.get_stats() == {} assert instance.reset is True def test_init_from_crawler(self): @@ -84,7 +83,7 @@ def test_spider_opened_new(self): """Middleware should create a .db file if not found.""" if self.db_path.exists(): self.db_path.unlink() - mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats) + mw = self.get_mw(dir=self.temp_dir, reset=False) assert not hasattr(self.mwcls, "db") mw.spider_opened(self.spider) assert self.temp_dir.is_dir() @@ -95,7 +94,7 @@ def test_spider_opened_new(self): def test_spider_opened_existing(self): """Middleware should open and use existing and valid .db files.""" self._create_test_db() - mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats) + mw = self.get_mw(dir=self.temp_dir, reset=False) assert not hasattr(self.mwcls, "db") mw.spider_opened(self.spider) assert hasattr(mw, "db") @@ -107,7 +106,7 @@ def test_spider_opened_corrupt_dbfile(self): # create an invalid .db file with self.db_path.open("wb") as dbfile: dbfile.write(b"bad") - mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats) + mw = self.get_mw(dir=self.temp_dir, reset=False) assert not hasattr(self.mwcls, "db") # file corruption is only detected when opening spider @@ -121,14 +120,14 @@ def test_spider_opened_corrupt_dbfile(self): def test_spider_opened_existing_spider_reset(self): self._create_test_db() - mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats) + mw = self.get_mw(self.temp_dir, reset=False) assert not hasattr(self.mwcls, "db") self.spider.deltafetch_reset = True mw.spider_opened(self.spider) assert mw.db.keys() == [] def test_spider_opened_reset_non_existing_db(self): - mw = self.mwcls(self.temp_dir, reset=True, stats=self.stats) + mw = self.get_mw(dir=self.temp_dir, reset=True) assert not hasattr(self.mwcls, "db") self.spider.deltafetch_reset = True mw.spider_opened(self.spider) @@ -136,7 +135,7 @@ def test_spider_opened_reset_non_existing_db(self): def test_spider_opened_recreate(self): self._create_test_db() - mw = 
self.mwcls(self.temp_dir, reset=True, stats=self.stats) + mw = self.get_mw(dir=self.temp_dir, reset=True) assert not hasattr(self.mwcls, "db") mw.spider_opened(self.spider) assert hasattr(mw, "db") @@ -144,7 +143,7 @@ def test_spider_opened_recreate(self): def test_spider_closed(self): self._create_test_db() - mw = self.mwcls(self.temp_dir, reset=True, stats=self.stats) + mw = self.get_mw(dir=self.temp_dir, reset=True) mw.spider_opened(self.spider) assert mw.db.get("random") is None mw.spider_closed(self.spider) @@ -153,12 +152,17 @@ def test_spider_closed(self): def test_process_spider_output(self): self._create_test_db() - mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats) + settings = { + "DELTAFETCH_DIR": self.temp_dir, + "DELTAFETCH_ENABLED": True, + } + crawler = get_crawler(Spider, settings_dict=settings) + mw = self.mwcls.from_crawler(crawler) mw.spider_opened(self.spider) response = mock.Mock() response.request = Request("http://url", meta={"deltafetch_key": "key"}) result = [] - assert list(mw.process_spider_output(response, result, self.spider)) == [] + assert not list(mw.process_spider_output(response, result, self.spider)) result = [ # same URL but with new key --> it should be processed Request("http://url", meta={"deltafetch_key": "key1"}), @@ -171,7 +175,7 @@ def test_process_spider_output(self): ] # the skipped "http://url1" should be counted in stats - assert self.stats.get_stats() == {"deltafetch/skipped": 1} + assert crawler.stats.get_stats() == {"deltafetch/skipped": 1} # b'key' should not be in the db yet as no item was collected yet assert set(mw.db.keys()) == {b"test_key_1", b"test_key_2"} @@ -194,7 +198,7 @@ def test_process_spider_output_with_ignored_request(self): response = mock.Mock() response.request = Request("http://url") result = [] - assert list(mw.process_spider_output(response, result, self.spider)) == [] + assert not list(mw.process_spider_output(response, result, self.spider)) result = [ Request("http://url1"), # 'url1' is already in the db, but deltafetch_enabled=False @@ -209,7 +213,7 @@ def test_process_spider_output_with_ignored_request(self): def test_process_spider_output_dict(self): self._create_test_db() - mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats) + mw = self.get_mw(dir=self.temp_dir, reset=False) mw.spider_opened(self.spider) response = mock.Mock() response.request = Request("http://url", meta={"deltafetch_key": "key"}) @@ -220,13 +224,13 @@ def test_process_spider_output_dict(self): def test_process_spider_output_stats(self): self._create_test_db() - mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats) + mw = self.get_mw(dir=self.temp_dir) mw.spider_opened(self.spider) response = mock.Mock() response.request = Request("http://url", meta={"deltafetch_key": "key"}) result = [] - assert list(mw.process_spider_output(response, result, self.spider)) == [] - assert self.stats.get_stats() == {} + assert not list(mw.process_spider_output(response, result, self.spider)) + assert mw.stats.get_stats() == {} result = [ Request("http://url", meta={"deltafetch_key": "key"}), Request("http://url1", meta={"deltafetch_key": "test_key_1"}), @@ -234,15 +238,20 @@ def test_process_spider_output_stats(self): assert list(mw.process_spider_output(response, result, self.spider)) == [ result[0] ] - assert self.stats.get_value("deltafetch/skipped") == 1 - result = [Item(), "not a base item"] + assert mw.stats.get_value("deltafetch/skipped") == 1 + + @dataclass + class TestItem: + foo: str + + result = [Item(), TestItem("bar")] 
assert list(mw.process_spider_output(response, result, self.spider)) == result - assert self.stats.get_value("deltafetch/stored") == 1 + assert mw.stats.get_value("deltafetch/stored") == 2 def test_init_from_crawler_legacy(self): # test with subclass not handling passed stats class LegacyDeltaFetchSubClass(self.mwcls): - def __init__(self, dir, reset=False, *args, **kwargs): + def __init__(self, dir, reset, *args, **kwargs): super().__init__(dir=dir, reset=reset) self.something = True @@ -283,32 +292,35 @@ def test_process_spider_output_stats_legacy(self): # testing the subclass not handling stats works at runtime # (i.e. that trying to update stats does not trigger exception) class LegacyDeltaFetchSubClass(self.mwcls): - def __init__(self, dir, reset=False, *args, **kwargs): + def __init__(self, dir, *args, reset=False, **kwargs): super().__init__(dir=dir, reset=reset) self.something = True self._create_test_db() - mw = LegacyDeltaFetchSubClass(self.temp_dir, reset=False) + mw = self.get_mw(dir=self.temp_dir, reset=False, cls=LegacyDeltaFetchSubClass) mw.spider_opened(self.spider) response = mock.Mock() response.request = Request("http://url", meta={"deltafetch_key": "key"}) result = [] - assert list(mw.process_spider_output(response, result, self.spider)) == [] - assert self.stats.get_stats() == {} + assert not list(mw.process_spider_output(response, result, self.spider)) + assert mw.stats.get_stats() == {} result = [ Request("http://url", meta={"deltafetch_key": "key"}), Request("http://url1", meta={"deltafetch_key": "test_key_1"}), ] - # stats should not be updated assert list(mw.process_spider_output(response, result, self.spider)) == [ result[0] ] - assert self.stats.get_value("deltafetch/skipped") is None + assert mw.stats.get_value("deltafetch/skipped") == 1 - result = [Item(), "not a base item"] + @dataclass + class TestItem: + foo: str + + result = [Item(), TestItem("bar")] assert list(mw.process_spider_output(response, result, self.spider)) == result - assert self.stats.get_value("deltafetch/stored") is None + assert mw.stats.get_value("deltafetch/stored") == 2 def test_get_key(self): settings = { @@ -319,10 +331,12 @@ def test_get_key(self): crawler = get_crawler(Spider, settings_dict=settings) mw = self.mwcls.from_crawler(crawler) test_req1 = Request("http://url1") - if _legacy_fingerprint: + try: + fingerprint = crawler.request_fingerprinter.fingerprint + except AttributeError: # Scrapy < 2.7.0 + from scrapy.utils.request import request_fingerprint + fingerprint = request_fingerprint - else: - fingerprint = RequestFingerprinter.from_crawler(crawler).fingerprint assert mw._get_key(test_req1) == to_bytes(fingerprint(test_req1)) test_req2 = Request("http://url2", meta={"deltafetch_key": b"dfkey1"}) assert mw._get_key(test_req2) == b"dfkey1" @@ -333,6 +347,6 @@ def test_get_key(self): def _create_test_db(self): # truncate test db if there were failed tests - with dbm.open(self.db_path, "n") as db: + with dbm.open(str(self.db_path), "n") as db: db[b"test_key_1"] = b"test_v_1" db[b"test_key_2"] = b"test_v_2" diff --git a/tox.ini b/tox.ini new file mode 100644 index 0000000..6c4228c --- /dev/null +++ b/tox.ini @@ -0,0 +1,50 @@ +[tox] +envlist = pre-commit,mypy,pylint,twinecheck,min,py39,py310,py311,py312,py313 + +[testenv] +deps = + pytest + pytest-cov + pytest-benchmark +commands = + pytest \ + --cov=scrapy_deltafetch \ + --cov-config=pyproject.toml \ + --cov-report=xml \ + --cov-report= \ + {posargs:scrapy_deltafetch tests} + +[testenv:min] +basepython = python3.9 +deps = + 
{[testenv]deps} + scrapy==1.1.0 + +[testenv:pre-commit] +deps = + pre-commit +commands = + pre-commit run {posargs:--all-files} + +[testenv:mypy] +deps = + {[testenv]deps} + mypy==1.15.0 +commands = + mypy {posargs:scrapy_deltafetch tests} + +# https://github.com/astral-sh/ruff/issues/970 +[testenv:pylint] +deps = + {[testenv]deps} + pylint==3.3.4 +commands = + pylint {posargs:scrapy_deltafetch tests} + +[testenv:twinecheck] +deps = + twine==6.1.0 + build==1.2.2.post1 +commands = + python -m build --sdist + twine check dist/* From 8af006a8112ea939523783044d951aef7fe07080 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 26 Feb 2025 00:17:04 +0100 Subject: [PATCH 10/17] CI: fix TOXENV definition --- .github/workflows/main.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 14c5506..06851ac 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -32,6 +32,8 @@ jobs: with: python-version: ${{ matrix.python-version }} - name: Run + env: + TOXENV: ${{ matrix.toxenv }} run: | pip install -U tox - tox -e ${{ matrix.toxenv }} + tox From 33ba8a23a9bd63bbf57412e97c63724e125fb131 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 26 Feb 2025 00:19:42 +0100 Subject: [PATCH 11/17] CI: fix duplicate jobs --- .github/workflows/main.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 06851ac..896f9fe 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -1,5 +1,8 @@ name: CI -on: [push, pull_request] +on: + push: + branches: [ master ] + pull_request: jobs: test: runs-on: ubuntu-latest From e6162071fb1bd344231c7ee2c736d92f06e7d0d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 26 Feb 2025 00:35:14 +0100 Subject: [PATCH 12/17] =?UTF-8?q?setup.py=20=E2=86=92=20pyproject.toml?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pyproject.toml | 34 ++++++++++++++++++++++++++++++++-- setup.cfg | 2 -- setup.py | 30 ------------------------------ 3 files changed, 32 insertions(+), 34 deletions(-) delete mode 100644 setup.cfg delete mode 100644 setup.py diff --git a/pyproject.toml b/pyproject.toml index 06a3f79..d43504c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,3 +1,32 @@ +[build-system] +requires = ["setuptools>=61.2"] +build-backend = "setuptools.build_meta" + +[project] +name = "scrapy-deltafetch" +version = "2.0.1" +authors = [{name = "Zyte", email = "opensource@zyte.com"}] +license = {text = "BSD"} +description = "Scrapy middleware to ignore previously crawled pages" +readme = "README.rst" +classifiers = [ + "Development Status :: 4 - Beta", + "License :: OSI Approved :: BSD License", + "Operating System :: OS Independent", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", +] +requires-python = ">=3.9" +dependencies = ["Scrapy>=1.1.0"] + +[project.urls] +Homepage = "http://github.com/scrapy-plugins/scrapy-deltafetch" + [tool.coverage.run] branch = true include = ["scrapy_deltafetch/*"] @@ -14,8 +43,6 @@ source = [ # https://github.com/nedbat/coveragepy/issues/831#issuecomment-517778185 exclude_lines = ["pragma: no cover", "if 
TYPE_CHECKING:"] - - [tool.pylint.MASTER] persistent = "no" jobs = 1 # >1 hides results @@ -177,3 +204,6 @@ ignore = [ [tool.ruff.lint.pydocstyle] convention = "pep257" + +[tool.setuptools] +packages = ["scrapy_deltafetch"] diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index 3c6e79c..0000000 --- a/setup.cfg +++ /dev/null @@ -1,2 +0,0 @@ -[bdist_wheel] -universal=1 diff --git a/setup.py b/setup.py deleted file mode 100644 index c3eed32..0000000 --- a/setup.py +++ /dev/null @@ -1,30 +0,0 @@ -from pathlib import Path - -from setuptools import setup - -setup( - name="scrapy-deltafetch", - version="2.0.1", - license="BSD", - description="Scrapy middleware to ignore previously crawled pages", - long_description=Path("README.rst").read_text(encoding="utf-8"), - author="Zyte", - author_email="opensource@zyte.com", - url="http://github.com/scrapy-plugins/scrapy-deltafetch", - packages=["scrapy_deltafetch"], - platforms=["Any"], - classifiers=[ - "Development Status :: 4 - Beta", - "License :: OSI Approved :: BSD License", - "Operating System :: OS Independent", - "Programming Language :: Python", - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11", - "Programming Language :: Python :: 3.12", - "Programming Language :: Python :: 3.13", - ], - install_requires=["Scrapy>=1.1.0"], - python_requires=">=3.9", -) From 61194546940812e390c178e2ad5f27df672756bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 26 Feb 2025 00:53:26 +0100 Subject: [PATCH 13/17] =?UTF-8?q?New=20CHANGES.rst=20entry,=20.bumpversion?= =?UTF-8?q?.cfg=20=E2=86=92=20pyproject.toml?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .bumpversion.cfg | 8 -------- CHANGES.rst | 10 ++++++++++ pyproject.toml | 18 ++++++++++++++++++ 3 files changed, 28 insertions(+), 8 deletions(-) delete mode 100644 .bumpversion.cfg diff --git a/.bumpversion.cfg b/.bumpversion.cfg deleted file mode 100644 index 1ca6818..0000000 --- a/.bumpversion.cfg +++ /dev/null @@ -1,8 +0,0 @@ -[bumpversion] -current_version = 2.0.1 -commit = True -tag = True - -[bumpversion:file:setup.py] - -[bumpversion:file:scrapy_deltafetch/__init__.py] diff --git a/CHANGES.rst b/CHANGES.rst index 31e1e5f..6c59cfc 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,5 +1,15 @@ Changes ======= + +2.1.0 (unreleased) +------------------ + +* Drop support for Python 3.8 and lower, add support for Python 3.9 and higher. +* Add support for Scrapy 2.12. +* Use the ``REQUEST_FINGERPRINTER_CLASS`` setting introduced in Scrapy 2.7. +* Support new item types introduced in Scrapy 2.2. +* Support ``Path`` instances in the ``DELTAFETCH_DIR`` setting. 
+ 2.0.0 (2021-09-20) ------------------ * drop Python 2 support diff --git a/pyproject.toml b/pyproject.toml index d43504c..32329bf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,6 +27,24 @@ dependencies = ["Scrapy>=1.1.0"] [project.urls] Homepage = "http://github.com/scrapy-plugins/scrapy-deltafetch" +[tool.bumpversion] +current_version = "2.0.1" +commit = true +tag = true + +[[tool.bumpversion.files]] +filename = 'CHANGES.rst' +search = "\\(unreleased\\)$" +replace = "({now:%Y-%m-%d})" +regex = true + +[[tool.bumpversion.files]] +search = "version = \"{current_version}\"" +filename = "pyproject.toml" + +[[tool.bumpversion.files]] +filename = "scrapy_deltafetch/__init__.py" + [tool.coverage.run] branch = true include = ["scrapy_deltafetch/*"] From ff74f039e164bf1faa59be0ac48ba0e8e20bbea3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 26 Feb 2025 00:54:58 +0100 Subject: [PATCH 14/17] Fix bumpversion config --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 32329bf..1db8257 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,6 +40,7 @@ regex = true [[tool.bumpversion.files]] search = "version = \"{current_version}\"" +replace = "version = \"{new_version}\"" filename = "pyproject.toml" [[tool.bumpversion.files]] From d30081680c52d308e3bb7c2747642fe7c1950372 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 26 Feb 2025 00:57:30 +0100 Subject: [PATCH 15/17] Remove .coveragerc, already in pyproject.toml --- .coveragerc | 3 --- 1 file changed, 3 deletions(-) delete mode 100644 .coveragerc diff --git a/.coveragerc b/.coveragerc deleted file mode 100644 index ec1040c..0000000 --- a/.coveragerc +++ /dev/null @@ -1,3 +0,0 @@ -[run] -branch = true -source = scrapy_deltafetch From 312afcc718176eb4a8455f3f1604920068166f5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 26 Feb 2025 00:58:33 +0100 Subject: [PATCH 16/17] Update the publish workflow --- .github/workflows/publish.yml | 42 ++++++++++++++++------------------- 1 file changed, 19 insertions(+), 23 deletions(-) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 175b20f..8d61dc6 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -1,27 +1,23 @@ -name: Upload Python Package - +name: Publish on: - release: - types: [created] - + push: + tags: + - '[0-9]+.[0-9]+.[0-9]+' jobs: - deploy: + publish: runs-on: ubuntu-latest - + environment: + name: pypi + url: https://pypi.org/p/${{ github.event.repository.name }} + permissions: + id-token: write steps: - - uses: actions/checkout@v2 - - name: Set up Python - uses: actions/setup-python@v2 - with: - python-version: "3.x" - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install setuptools wheel twine - - name: Build and publish - env: - TWINE_USERNAME: __token__ - TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }} - run: | - python setup.py sdist bdist_wheel - twine upload dist/* \ No newline at end of file + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: 3.13 + - run: | + python -m pip install --upgrade build + python -m build + - name: Publish to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 From 7f0211cef5340f51a70524c778199156d05d92b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 26 Feb 2025 21:28:35 +0100 Subject: [PATCH 17/17] CI: add CodeCov --- .github/workflows/main.yml | 2 ++ 1 file changed, 
2 insertions(+) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 896f9fe..fd707ad 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -40,3 +40,5 @@ jobs: run: | pip install -U tox tox + - name: Upload coverage report + uses: codecov/codecov-action@v5
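
Note on the test refactor in this series: the tests now build the middleware through ``from_crawler()`` with a crawler created by ``scrapy.utils.test.get_crawler()``, so that the stats collector and the crawler's request fingerprinter are wired in the same way as in a real crawl. The ``get_mw()`` helper those tests call is not shown in this excerpt; the following is only a minimal sketch of what such a helper could look like, written as a standalone function for clarity (in the test class it is presumably a method), with its exact signature, defaults, and the ``cls`` parameter inferred from the call sites in the diffs above:

    from scrapy.spiders import Spider
    from scrapy.utils.test import get_crawler

    from scrapy_deltafetch.middleware import DeltaFetch


    def get_mw(dir, reset=False, cls=DeltaFetch):
        # Build the middleware the way Scrapy would: via from_crawler(), so the
        # crawler's stats collector and (on Scrapy >= 2.7) its request
        # fingerprinter are attached to the instance.
        settings = {
            "DELTAFETCH_DIR": str(dir),
            "DELTAFETCH_ENABLED": True,
            "DELTAFETCH_RESET": reset,
        }
        crawler = get_crawler(Spider, settings_dict=settings)
        return cls.from_crawler(crawler)

The change of the expected ``deltafetch/stored`` value from 1 to 2 in those tests follows from the "Support new item types introduced in Scrapy 2.2" entry in CHANGES.rst: a dataclass instance such as ``TestItem("bar")`` is now recognized as an item alongside ``Item()``, so both yielded results are stored.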