From 30c1e9771370280f985298d160aa90d12f6a8c7e Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Fri, 14 Feb 2025 16:16:42 +0100 Subject: [PATCH 01/18] feat(scrapy): add Scrapy cache using Apify KV store This code has been originally developed for the https://github.com/juniorguru/plucker/ project, which is licensed under AGPL-3.0-only. I am the sole author of that code and hereby I grant the https://github.com/apify/apify-sdk-python/ project the right to use it under an Apache-2.0 license, without the copyleft taking effect. My intention is to contribute this code to the upstream, remove it from my project, and only import the component from the apify package as a dependency, as I believe this component could be useful to other users of the apify package. --- src/apify/scrapy/cache.py | 168 ++++++++++++++++++++++++++++++++ tests/unit/scrapy/test_cache.py | 35 +++++++ 2 files changed, 203 insertions(+) create mode 100644 src/apify/scrapy/cache.py create mode 100644 tests/unit/scrapy/test_cache.py diff --git a/src/apify/scrapy/cache.py b/src/apify/scrapy/cache.py new file mode 100644 index 00000000..e212ef63 --- /dev/null +++ b/src/apify/scrapy/cache.py @@ -0,0 +1,168 @@ +from __future__ import annotations + +import gzip +import io +import pickle +import struct +from logging import getLogger +from time import time +from typing import TYPE_CHECKING + +from scrapy.http.headers import Headers +from scrapy.responsetypes import responsetypes + +from ._async_thread import AsyncThread +from apify import Configuration +from apify.apify_storage_client import ApifyStorageClient +from apify.storages import KeyValueStore + +if TYPE_CHECKING: + from scrapy import Request, Spider + from scrapy.http.response import Response + from scrapy.settings import BaseSettings + from scrapy.utils.request import RequestFingerprinterProtocol + +logger = getLogger(__name__) + + +class ApifyCacheStorage: + """A Scrapy cache storage that uses the Apify `KeyValueStore` to store responses. + + This cache storage requires the asyncio Twisted reactor to be installed. 
+ """ + + def __init__(self, settings: BaseSettings) -> None: + self.expiration_max_items = 100 + self.expiration_secs: int = settings.getint("HTTPCACHE_EXPIRATION_SECS") + self.spider: Spider | None = None + self._kv: KeyValueStore | None = None + self._fingerprinter: RequestFingerprinterProtocol | None = None + self._async_thread: AsyncThread | None = None + + def open_spider(self, spider: Spider) -> None: + logger.debug("Using Apify key value cache storage", extra={"spider": spider}) + self.spider = spider + self._fingerprinter = spider.crawler.request_fingerprinter + kv_name = f"httpcache-{spider.name}" + + async def open_kv() -> KeyValueStore: + config = Configuration.get_global_configuration() + if config.is_at_home: + storage_client = ApifyStorageClient.from_config(config) + return await KeyValueStore.open( + name=kv_name, storage_client=storage_client + ) + return await KeyValueStore.open(name=kv_name) + + logger.debug("Starting background thread for cache storage's event loop") + self._async_thread = AsyncThread() + logger.debug(f"Opening cache storage's {kv_name!r} key value store") + self._kv = self._async_thread.run_coro(open_kv()) + + def close_spider(self, spider: Spider, current_time: int | None = None) -> None: + assert self._async_thread is not None, "Async thread not initialized" + + logger.info(f"Cleaning up cache items (max {self.expiration_max_items})") + if 0 < self.expiration_secs: + if current_time is None: + current_time = int(time()) + + async def expire_kv() -> None: + assert self._kv is not None, "Key value store not initialized" + i = 0 + async for item in self._kv.iterate_keys(): + value = await self._kv.get_value(item.key) + try: + gzip_time = read_gzip_time(value) + except Exception as e: + logger.warning(f"Malformed cache item {item.key}: {e}") + await self._kv.set_value(item.key, None) + else: + if self.expiration_secs < current_time - gzip_time: + logger.debug(f"Expired cache item {item.key}") + await self._kv.set_value(item.key, None) + else: + logger.debug(f"Valid cache item {item.key}") + if i == self.expiration_max_items: + break + i += 1 + + self._async_thread.run_coro(expire_kv()) + + logger.debug("Closing cache storage") + try: + self._async_thread.close() + except KeyboardInterrupt: + logger.warning("Shutdown interrupted by KeyboardInterrupt!") + except Exception: + logger.exception("Exception occurred while shutting down cache storage") + finally: + logger.debug("Cache storage closed") + + def retrieve_response( + self, spider: Spider, request: Request, current_time: int | None = None + ) -> Response | None: + assert self._async_thread is not None, "Async thread not initialized" + assert self._kv is not None, "Key value store not initialized" + assert self._fingerprinter is not None, "Request fingerprinter not initialized" + + key = self._fingerprinter.fingerprint(request).hex() + value = self._async_thread.run_coro(self._kv.get_value(key)) + + if value is None: + logger.debug("Cache miss", extra={"request": request}) + return None + + if current_time is None: + current_time = int(time()) + if 0 < self.expiration_secs < current_time - read_gzip_time(value): + logger.debug("Cache expired", extra={"request": request}) + return None + + data = from_gzip(value) + url = data["url"] + status = data["status"] + headers = Headers(data["headers"]) + body = data["body"] + respcls = responsetypes.from_args(headers=headers, url=url, body=body) + + logger.debug("Cache hit", extra={"request": request}) + return respcls(url=url, headers=headers, status=status, 
body=body) + + def store_response( + self, spider: Spider, request: Request, response: Response + ) -> None: + assert self._async_thread is not None, "Async thread not initialized" + assert self._kv is not None, "Key value store not initialized" + assert self._fingerprinter is not None, "Request fingerprinter not initialized" + + key = self._fingerprinter.fingerprint(request).hex() + data = { + "status": response.status, + "url": response.url, + "headers": dict(response.headers), + "body": response.body, + } + value = to_gzip(data) + self._async_thread.run_coro(self._kv.set_value(key, value)) + + +def to_gzip(data: dict, mtime: int | None = None) -> bytes: + """Dump a dictionary to a gzip-compressed byte stream.""" + with io.BytesIO() as byte_stream: + with gzip.GzipFile(fileobj=byte_stream, mode="wb", mtime=mtime) as gzip_file: + pickle.dump(data, gzip_file, protocol=4) + return byte_stream.getvalue() + + +def from_gzip(gzip_bytes: bytes) -> dict: + """Load a dictionary from a gzip-compressed byte stream.""" + with io.BytesIO(gzip_bytes) as byte_stream, gzip.GzipFile(fileobj=byte_stream, mode="rb") as gzip_file: + return pickle.load(gzip_file) + + +def read_gzip_time(gzip_bytes: bytes) -> int: + """Read the modification time from a gzip-compressed byte stream without decompressing the data.""" + header = gzip_bytes[:10] + header_components = struct.unpack(" None: + assert from_gzip(to_gzip({"name": "Alice"})) == {"name": "Alice"} + + +def test_to_gzip() -> None: + data_bytes = to_gzip({"name": "Alice"}, mtime=0) + + assert data_bytes == FIXTURE_BYTES + + +def test_from_gzip() -> None: + data_dict = from_gzip(FIXTURE_BYTES) + + assert data_dict == {"name": "Alice"} + + +def test_read_gzip_time() -> None: + assert read_gzip_time(FIXTURE_BYTES) == 0 + + +def test_read_gzip_time_non_zero() -> None: + current_time = int(time()) + data_bytes = to_gzip({"name": "Alice"}, mtime=current_time) + + assert read_gzip_time(data_bytes) == current_time From 96f1142c0a6c0fad2185f554ba5366518d84331e Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Fri, 14 Feb 2025 16:24:45 +0100 Subject: [PATCH 02/18] style: format code --- src/apify/scrapy/cache.py | 78 +++++++++++++++------------------ tests/unit/scrapy/test_cache.py | 12 ++--- 2 files changed, 42 insertions(+), 48 deletions(-) diff --git a/src/apify/scrapy/cache.py b/src/apify/scrapy/cache.py index e212ef63..b4f3a417 100644 --- a/src/apify/scrapy/cache.py +++ b/src/apify/scrapy/cache.py @@ -33,25 +33,23 @@ class ApifyCacheStorage: def __init__(self, settings: BaseSettings) -> None: self.expiration_max_items = 100 - self.expiration_secs: int = settings.getint("HTTPCACHE_EXPIRATION_SECS") + self.expiration_secs: int = settings.getint('HTTPCACHE_EXPIRATION_SECS') self.spider: Spider | None = None self._kv: KeyValueStore | None = None self._fingerprinter: RequestFingerprinterProtocol | None = None self._async_thread: AsyncThread | None = None def open_spider(self, spider: Spider) -> None: - logger.debug("Using Apify key value cache storage", extra={"spider": spider}) + logger.debug('Using Apify key value cache storage', extra={'spider': spider}) self.spider = spider self._fingerprinter = spider.crawler.request_fingerprinter - kv_name = f"httpcache-{spider.name}" + kv_name = f'httpcache-{spider.name}' async def open_kv() -> KeyValueStore: config = Configuration.get_global_configuration() if config.is_at_home: storage_client = ApifyStorageClient.from_config(config) - return await KeyValueStore.open( - name=kv_name, storage_client=storage_client - ) + return 
await KeyValueStore.open(name=kv_name, storage_client=storage_client) return await KeyValueStore.open(name=kv_name) logger.debug("Starting background thread for cache storage's event loop") @@ -60,88 +58,84 @@ async def open_kv() -> KeyValueStore: self._kv = self._async_thread.run_coro(open_kv()) def close_spider(self, spider: Spider, current_time: int | None = None) -> None: - assert self._async_thread is not None, "Async thread not initialized" + assert self._async_thread is not None, 'Async thread not initialized' - logger.info(f"Cleaning up cache items (max {self.expiration_max_items})") + logger.info(f'Cleaning up cache items (max {self.expiration_max_items})') if 0 < self.expiration_secs: if current_time is None: current_time = int(time()) async def expire_kv() -> None: - assert self._kv is not None, "Key value store not initialized" + assert self._kv is not None, 'Key value store not initialized' i = 0 async for item in self._kv.iterate_keys(): value = await self._kv.get_value(item.key) try: gzip_time = read_gzip_time(value) except Exception as e: - logger.warning(f"Malformed cache item {item.key}: {e}") + logger.warning(f'Malformed cache item {item.key}: {e}') await self._kv.set_value(item.key, None) else: if self.expiration_secs < current_time - gzip_time: - logger.debug(f"Expired cache item {item.key}") + logger.debug(f'Expired cache item {item.key}') await self._kv.set_value(item.key, None) else: - logger.debug(f"Valid cache item {item.key}") + logger.debug(f'Valid cache item {item.key}') if i == self.expiration_max_items: break i += 1 self._async_thread.run_coro(expire_kv()) - logger.debug("Closing cache storage") + logger.debug('Closing cache storage') try: self._async_thread.close() except KeyboardInterrupt: - logger.warning("Shutdown interrupted by KeyboardInterrupt!") + logger.warning('Shutdown interrupted by KeyboardInterrupt!') except Exception: - logger.exception("Exception occurred while shutting down cache storage") + logger.exception('Exception occurred while shutting down cache storage') finally: - logger.debug("Cache storage closed") + logger.debug('Cache storage closed') - def retrieve_response( - self, spider: Spider, request: Request, current_time: int | None = None - ) -> Response | None: - assert self._async_thread is not None, "Async thread not initialized" - assert self._kv is not None, "Key value store not initialized" - assert self._fingerprinter is not None, "Request fingerprinter not initialized" + def retrieve_response(self, spider: Spider, request: Request, current_time: int | None = None) -> Response | None: + assert self._async_thread is not None, 'Async thread not initialized' + assert self._kv is not None, 'Key value store not initialized' + assert self._fingerprinter is not None, 'Request fingerprinter not initialized' key = self._fingerprinter.fingerprint(request).hex() value = self._async_thread.run_coro(self._kv.get_value(key)) if value is None: - logger.debug("Cache miss", extra={"request": request}) + logger.debug('Cache miss', extra={'request': request}) return None if current_time is None: current_time = int(time()) if 0 < self.expiration_secs < current_time - read_gzip_time(value): - logger.debug("Cache expired", extra={"request": request}) + logger.debug('Cache expired', extra={'request': request}) return None data = from_gzip(value) - url = data["url"] - status = data["status"] - headers = Headers(data["headers"]) - body = data["body"] + url = data['url'] + status = data['status'] + headers = Headers(data['headers']) + body = data['body'] 
respcls = responsetypes.from_args(headers=headers, url=url, body=body) - logger.debug("Cache hit", extra={"request": request}) + logger.debug('Cache hit', extra={'request': request}) return respcls(url=url, headers=headers, status=status, body=body) - def store_response( - self, spider: Spider, request: Request, response: Response - ) -> None: - assert self._async_thread is not None, "Async thread not initialized" - assert self._kv is not None, "Key value store not initialized" - assert self._fingerprinter is not None, "Request fingerprinter not initialized" + def store_response(self, spider: Spider, request: Request, response: Response) -> None: + assert self._async_thread is not None, 'Async thread not initialized' + assert self._kv is not None, 'Key value store not initialized' + assert self._fingerprinter is not None, 'Request fingerprinter not initialized' key = self._fingerprinter.fingerprint(request).hex() data = { - "status": response.status, - "url": response.url, - "headers": dict(response.headers), - "body": response.body, + 'status': response.status, + 'url': response.url, + 'headers': dict(response.headers), + 'body': response.body, } value = to_gzip(data) self._async_thread.run_coro(self._kv.set_value(key, value)) @@ -150,19 +144,19 @@ def store_response( def to_gzip(data: dict, mtime: int | None = None) -> bytes: """Dump a dictionary to a gzip-compressed byte stream.""" with io.BytesIO() as byte_stream: - with gzip.GzipFile(fileobj=byte_stream, mode="wb", mtime=mtime) as gzip_file: + with gzip.GzipFile(fileobj=byte_stream, mode='wb', mtime=mtime) as gzip_file: pickle.dump(data, gzip_file, protocol=4) return byte_stream.getvalue() def from_gzip(gzip_bytes: bytes) -> dict: """Load a dictionary from a gzip-compressed byte stream.""" - with io.BytesIO(gzip_bytes) as byte_stream, gzip.GzipFile(fileobj=byte_stream, mode="rb") as gzip_file: + with io.BytesIO(gzip_bytes) as byte_stream, gzip.GzipFile(fileobj=byte_stream, mode='rb') as gzip_file: return pickle.load(gzip_file) def read_gzip_time(gzip_bytes: bytes) -> int: """Read the modification time from a gzip-compressed byte stream without decompressing the data.""" header = gzip_bytes[:10] - header_components = struct.unpack(" None: - assert from_gzip(to_gzip({"name": "Alice"})) == {"name": "Alice"} + assert from_gzip(to_gzip({'name': 'Alice'})) == {'name': 'Alice'} def test_to_gzip() -> None: - data_bytes = to_gzip({"name": "Alice"}, mtime=0) + data_bytes = to_gzip({'name': 'Alice'}, mtime=0) assert data_bytes == FIXTURE_BYTES @@ -21,7 +21,7 @@ def test_to_gzip() -> None: def test_from_gzip() -> None: data_dict = from_gzip(FIXTURE_BYTES) - assert data_dict == {"name": "Alice"} + assert data_dict == {'name': 'Alice'} def test_read_gzip_time() -> None: @@ -30,6 +30,6 @@ def test_read_gzip_time() -> None: def test_read_gzip_time_non_zero() -> None: current_time = int(time()) - data_bytes = to_gzip({"name": "Alice"}, mtime=current_time) + data_bytes = to_gzip({'name': 'Alice'}, mtime=current_time) assert read_gzip_time(data_bytes) == current_time From 40076f318293e968181dd50cde3b19f4716e5857 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Fri, 14 Feb 2025 16:45:16 +0100 Subject: [PATCH 03/18] fix: make linter happy --- src/apify/scrapy/cache.py | 49 +++++++++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 18 deletions(-) diff --git a/src/apify/scrapy/cache.py b/src/apify/scrapy/cache.py index b4f3a417..b084bd62 100644 --- a/src/apify/scrapy/cache.py +++ b/src/apify/scrapy/cache.py @@ -40,6 +40,7 @@ def 
__init__(self, settings: BaseSettings) -> None: self._async_thread: AsyncThread | None = None def open_spider(self, spider: Spider) -> None: + """Open the cache storage for a spider.""" logger.debug('Using Apify key value cache storage', extra={'spider': spider}) self.spider = spider self._fingerprinter = spider.crawler.request_fingerprinter @@ -57,16 +58,19 @@ async def open_kv() -> KeyValueStore: logger.debug(f"Opening cache storage's {kv_name!r} key value store") self._kv = self._async_thread.run_coro(open_kv()) - def close_spider(self, spider: Spider, current_time: int | None = None) -> None: - assert self._async_thread is not None, 'Async thread not initialized' + def close_spider(self, _: Spider, current_time: int | None = None) -> None: + """Close the cache storage for a spider.""" + if self._async_thread is None: + raise ValueError('Async thread not initialized') logger.info(f'Cleaning up cache items (max {self.expiration_max_items})') - if 0 < self.expiration_secs: + if self.expiration_secs > 0: if current_time is None: current_time = int(time()) async def expire_kv() -> None: - assert self._kv is not None, 'Key value store not initialized' + if self._kv is None: + raise ValueError('Key value store not initialized') i = 0 async for item in self._kv.iterate_keys(): value = await self._kv.get_value(item.key) @@ -97,10 +101,14 @@ async def expire_kv() -> None: finally: logger.debug('Cache storage closed') - def retrieve_response(self, spider: Spider, request: Request, current_time: int | None = None) -> Response | None: - assert self._async_thread is not None, 'Async thread not initialized' - assert self._kv is not None, 'Key value store not initialized' - assert self._fingerprinter is not None, 'Request fingerprinter not initialized' + def retrieve_response(self, _: Spider, request: Request, current_time: int | None = None) -> Response | None: + """Retrieve a response from the cache storage.""" + if self._async_thread is None: + raise ValueError('Async thread not initialized') + if self._kv is None: + raise ValueError('Key value store not initialized') + if self._fingerprinter is None: + raise ValueError('Request fingerprinter not initialized') key = self._fingerprinter.fingerprint(request).hex() value = self._async_thread.run_coro(self._kv.get_value(key)) @@ -125,10 +133,14 @@ def retrieve_response(self, spider: Spider, request: Request, current_time: int logger.debug('Cache hit', extra={'request': request}) return respcls(url=url, headers=headers, status=status, body=body) - def store_response(self, spider: Spider, request: Request, response: Response) -> None: - assert self._async_thread is not None, 'Async thread not initialized' - assert self._kv is not None, 'Key value store not initialized' - assert self._fingerprinter is not None, 'Request fingerprinter not initialized' + def store_response(self, _: Spider, request: Request, response: Response) -> None: + """Store a response in the cache storage.""" + if self._async_thread is None: + raise ValueError('Async thread not initialized') + if self._kv is None: + raise ValueError('Key value store not initialized') + if self._fingerprinter is None: + raise ValueError('Request fingerprinter not initialized') key = self._fingerprinter.fingerprint(request).hex() data = { @@ -143,20 +155,21 @@ def store_response(self, spider: Spider, request: Request, response: Response) - def to_gzip(data: dict, mtime: int | None = None) -> bytes: """Dump a dictionary to a gzip-compressed byte stream.""" - with io.BytesIO() as byte_stream: - with 
gzip.GzipFile(fileobj=byte_stream, mode='wb', mtime=mtime) as gzip_file: - pickle.dump(data, gzip_file, protocol=4) - return byte_stream.getvalue() + with io.BytesIO() as byte_stream, gzip.GzipFile(fileobj=byte_stream, mode='wb', mtime=mtime) as gzip_file: + pickle.dump(data, gzip_file, protocol=4) + return byte_stream.getvalue() def from_gzip(gzip_bytes: bytes) -> dict: """Load a dictionary from a gzip-compressed byte stream.""" with io.BytesIO(gzip_bytes) as byte_stream, gzip.GzipFile(fileobj=byte_stream, mode='rb') as gzip_file: - return pickle.load(gzip_file) + data: dict = pickle.load(gzip_file) + return data def read_gzip_time(gzip_bytes: bytes) -> int: """Read the modification time from a gzip-compressed byte stream without decompressing the data.""" header = gzip_bytes[:10] header_components = struct.unpack(' Date: Fri, 14 Feb 2025 17:12:51 +0100 Subject: [PATCH 04/18] fix: return back nested syntax This error was introduced when I tried to make all the linters happy, uh. --- src/apify/scrapy/cache.py | 7 ++++--- tests/unit/scrapy/test_cache.py | 10 ++++++---- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/src/apify/scrapy/cache.py b/src/apify/scrapy/cache.py index b084bd62..eb7f0a0d 100644 --- a/src/apify/scrapy/cache.py +++ b/src/apify/scrapy/cache.py @@ -155,9 +155,10 @@ def store_response(self, _: Spider, request: Request, response: Response) -> Non def to_gzip(data: dict, mtime: int | None = None) -> bytes: """Dump a dictionary to a gzip-compressed byte stream.""" - with io.BytesIO() as byte_stream, gzip.GzipFile(fileobj=byte_stream, mode='wb', mtime=mtime) as gzip_file: - pickle.dump(data, gzip_file, protocol=4) - return byte_stream.getvalue() + with io.BytesIO() as byte_stream: + with gzip.GzipFile(fileobj=byte_stream, mode='wb', mtime=mtime) as gzip_file: + pickle.dump(data, gzip_file, protocol=4) + return byte_stream.getvalue() def from_gzip(gzip_bytes: bytes) -> dict: diff --git a/tests/unit/scrapy/test_cache.py b/tests/unit/scrapy/test_cache.py index 62832793..40106fe4 100644 --- a/tests/unit/scrapy/test_cache.py +++ b/tests/unit/scrapy/test_cache.py @@ -2,6 +2,8 @@ from apify.scrapy.cache import from_gzip, read_gzip_time, to_gzip +FIXTURE_DICT = {'name': 'Alice'} + FIXTURE_BYTES = ( b'\x1f\x8b\x08\x00\x00\x00\x00\x00\x02\xffk`\x99*\xcc\x00\x01\xb5SzX\xf2\x12s' b'S\xa7\xf4\xb0:\xe6d&\xa7N)\xd6\x03\x00\x1c\xe8U\x9c\x1e\x00\x00\x00' @@ -9,11 +11,11 @@ def test_gzip() -> None: - assert from_gzip(to_gzip({'name': 'Alice'})) == {'name': 'Alice'} + assert from_gzip(to_gzip(FIXTURE_DICT)) == FIXTURE_DICT def test_to_gzip() -> None: - data_bytes = to_gzip({'name': 'Alice'}, mtime=0) + data_bytes = to_gzip(FIXTURE_DICT, mtime=0) assert data_bytes == FIXTURE_BYTES @@ -21,7 +23,7 @@ def test_to_gzip() -> None: def test_from_gzip() -> None: data_dict = from_gzip(FIXTURE_BYTES) - assert data_dict == {'name': 'Alice'} + assert data_dict == FIXTURE_DICT def test_read_gzip_time() -> None: @@ -30,6 +32,6 @@ def test_read_gzip_time() -> None: def test_read_gzip_time_non_zero() -> None: current_time = int(time()) - data_bytes = to_gzip({'name': 'Alice'}, mtime=current_time) + data_bytes = to_gzip(FIXTURE_DICT, mtime=current_time) assert read_gzip_time(data_bytes) == current_time From ece55b124d54ccc21e2b8df9f7c704464f0a94c2 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Mon, 10 Mar 2025 14:19:17 +0100 Subject: [PATCH 05/18] feat: introduce the extensions package --- src/apify/scrapy/extensions/__init__.py | 0 src/apify/scrapy/{ => extensions}/cache.py | 2 +- 
tests/unit/scrapy/test_cache.py | 2 +- 3 files changed, 2 insertions(+), 2 deletions(-) create mode 100644 src/apify/scrapy/extensions/__init__.py rename src/apify/scrapy/{ => extensions}/cache.py (99%) diff --git a/src/apify/scrapy/extensions/__init__.py b/src/apify/scrapy/extensions/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/apify/scrapy/cache.py b/src/apify/scrapy/extensions/cache.py similarity index 99% rename from src/apify/scrapy/cache.py rename to src/apify/scrapy/extensions/cache.py index eb7f0a0d..130a2089 100644 --- a/src/apify/scrapy/cache.py +++ b/src/apify/scrapy/extensions/cache.py @@ -11,7 +11,7 @@ from scrapy.http.headers import Headers from scrapy.responsetypes import responsetypes -from ._async_thread import AsyncThread +from .._async_thread import AsyncThread from apify import Configuration from apify.apify_storage_client import ApifyStorageClient from apify.storages import KeyValueStore diff --git a/tests/unit/scrapy/test_cache.py b/tests/unit/scrapy/test_cache.py index 40106fe4..26799a95 100644 --- a/tests/unit/scrapy/test_cache.py +++ b/tests/unit/scrapy/test_cache.py @@ -1,6 +1,6 @@ from time import time -from apify.scrapy.cache import from_gzip, read_gzip_time, to_gzip +from apify.scrapy.extensions.cache import from_gzip, read_gzip_time, to_gzip FIXTURE_DICT = {'name': 'Alice'} From 468d3d2a2eaaf8ff8418b569c34ab7b785fc96f0 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Mon, 10 Mar 2025 14:20:54 +0100 Subject: [PATCH 06/18] refactor: make linter happy --- src/apify/scrapy/extensions/cache.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/apify/scrapy/extensions/cache.py b/src/apify/scrapy/extensions/cache.py index 130a2089..6c211bc2 100644 --- a/src/apify/scrapy/extensions/cache.py +++ b/src/apify/scrapy/extensions/cache.py @@ -11,9 +11,9 @@ from scrapy.http.headers import Headers from scrapy.responsetypes import responsetypes -from .._async_thread import AsyncThread from apify import Configuration from apify.apify_storage_client import ApifyStorageClient +from apify.scrapy._async_thread import AsyncThread from apify.storages import KeyValueStore if TYPE_CHECKING: From 48ee0265e8b796149bfe72f4ed970fb342471208 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Mon, 10 Mar 2025 14:25:34 +0100 Subject: [PATCH 07/18] fix: don't use public properties --- src/apify/scrapy/extensions/cache.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/apify/scrapy/extensions/cache.py b/src/apify/scrapy/extensions/cache.py index 6c211bc2..b3dfe31b 100644 --- a/src/apify/scrapy/extensions/cache.py +++ b/src/apify/scrapy/extensions/cache.py @@ -32,9 +32,9 @@ class ApifyCacheStorage: """ def __init__(self, settings: BaseSettings) -> None: - self.expiration_max_items = 100 - self.expiration_secs: int = settings.getint('HTTPCACHE_EXPIRATION_SECS') - self.spider: Spider | None = None + self._expiration_max_items = 100 + self._expiration_secs: int = settings.getint('HTTPCACHE_EXPIRATION_SECS') + self._spider: Spider | None = None self._kv: KeyValueStore | None = None self._fingerprinter: RequestFingerprinterProtocol | None = None self._async_thread: AsyncThread | None = None @@ -42,7 +42,7 @@ def __init__(self, settings: BaseSettings) -> None: def open_spider(self, spider: Spider) -> None: """Open the cache storage for a spider.""" logger.debug('Using Apify key value cache storage', extra={'spider': spider}) - self.spider = spider + self._spider = spider self._fingerprinter = 
spider.crawler.request_fingerprinter kv_name = f'httpcache-{spider.name}' @@ -63,8 +63,8 @@ def close_spider(self, _: Spider, current_time: int | None = None) -> None: if self._async_thread is None: raise ValueError('Async thread not initialized') - logger.info(f'Cleaning up cache items (max {self.expiration_max_items})') - if self.expiration_secs > 0: + logger.info(f'Cleaning up cache items (max {self._expiration_max_items})') + if self._expiration_secs > 0: if current_time is None: current_time = int(time()) @@ -80,12 +80,12 @@ async def expire_kv() -> None: logger.warning(f'Malformed cache item {item.key}: {e}') await self._kv.set_value(item.key, None) else: - if self.expiration_secs < current_time - gzip_time: + if self._expiration_secs < current_time - gzip_time: logger.debug(f'Expired cache item {item.key}') await self._kv.set_value(item.key, None) else: logger.debug(f'Valid cache item {item.key}') - if i == self.expiration_max_items: + if i == self._expiration_max_items: break i += 1 @@ -119,7 +119,7 @@ def retrieve_response(self, _: Spider, request: Request, current_time: int | Non if current_time is None: current_time = int(time()) - if 0 < self.expiration_secs < current_time - read_gzip_time(value): + if 0 < self._expiration_secs < current_time - read_gzip_time(value): logger.debug('Cache expired', extra={'request': request}) return None From f6701db9ab95511b7f0c5ae3f1eb9361081a7a26 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Mon, 10 Mar 2025 14:27:11 +0100 Subject: [PATCH 08/18] fix: rename module to httpcache and fix tests location --- src/apify/scrapy/extensions/{cache.py => httpcache.py} | 0 tests/unit/scrapy/extensions/__init__.py | 0 .../unit/scrapy/{test_cache.py => extensions/test_httpcache.py} | 2 +- 3 files changed, 1 insertion(+), 1 deletion(-) rename src/apify/scrapy/extensions/{cache.py => httpcache.py} (100%) create mode 100644 tests/unit/scrapy/extensions/__init__.py rename tests/unit/scrapy/{test_cache.py => extensions/test_httpcache.py} (91%) diff --git a/src/apify/scrapy/extensions/cache.py b/src/apify/scrapy/extensions/httpcache.py similarity index 100% rename from src/apify/scrapy/extensions/cache.py rename to src/apify/scrapy/extensions/httpcache.py diff --git a/tests/unit/scrapy/extensions/__init__.py b/tests/unit/scrapy/extensions/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/unit/scrapy/test_cache.py b/tests/unit/scrapy/extensions/test_httpcache.py similarity index 91% rename from tests/unit/scrapy/test_cache.py rename to tests/unit/scrapy/extensions/test_httpcache.py index 26799a95..b1c8ed1e 100644 --- a/tests/unit/scrapy/test_cache.py +++ b/tests/unit/scrapy/extensions/test_httpcache.py @@ -1,6 +1,6 @@ from time import time -from apify.scrapy.extensions.cache import from_gzip, read_gzip_time, to_gzip +from apify.scrapy.extensions.httpcache import from_gzip, read_gzip_time, to_gzip FIXTURE_DICT = {'name': 'Alice'} From 21ce5fc4b7004fbf701d83e1aba321d3eae5bb1a Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Mon, 10 Mar 2025 14:32:20 +0100 Subject: [PATCH 09/18] docs: improve docstring of ApifyCacheStorage, describe usage --- src/apify/scrapy/extensions/httpcache.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/apify/scrapy/extensions/httpcache.py b/src/apify/scrapy/extensions/httpcache.py index b3dfe31b..d2009f9d 100644 --- a/src/apify/scrapy/extensions/httpcache.py +++ b/src/apify/scrapy/extensions/httpcache.py @@ -28,7 +28,10 @@ class ApifyCacheStorage: """A Scrapy cache storage that 
uses the Apify `KeyValueStore` to store responses. - This cache storage requires the asyncio Twisted reactor to be installed. + It can be set as a storage for Scrapy's built-in `HttpCacheMiddleware`, which caches + responses to requests. See HTTPCache middleware settings (prefixed with `HTTPCACHE_`) + in the Scrapy documentation for more information. Requires the asyncio Twisted reactor + to be installed. """ def __init__(self, settings: BaseSettings) -> None: From c19a93e7d17e146c8a86ea651862d45feb7bb397 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Mon, 10 Mar 2025 14:34:46 +0100 Subject: [PATCH 10/18] refactor: rename kv to kvs per convention --- src/apify/scrapy/extensions/httpcache.py | 36 ++++++++++++------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/src/apify/scrapy/extensions/httpcache.py b/src/apify/scrapy/extensions/httpcache.py index d2009f9d..ec7caac6 100644 --- a/src/apify/scrapy/extensions/httpcache.py +++ b/src/apify/scrapy/extensions/httpcache.py @@ -38,7 +38,7 @@ def __init__(self, settings: BaseSettings) -> None: self._expiration_max_items = 100 self._expiration_secs: int = settings.getint('HTTPCACHE_EXPIRATION_SECS') self._spider: Spider | None = None - self._kv: KeyValueStore | None = None + self._kvs: KeyValueStore | None = None self._fingerprinter: RequestFingerprinterProtocol | None = None self._async_thread: AsyncThread | None = None @@ -47,19 +47,19 @@ def open_spider(self, spider: Spider) -> None: logger.debug('Using Apify key value cache storage', extra={'spider': spider}) self._spider = spider self._fingerprinter = spider.crawler.request_fingerprinter - kv_name = f'httpcache-{spider.name}' + kvs_name = f'httpcache-{spider.name}' - async def open_kv() -> KeyValueStore: + async def open_kvs() -> KeyValueStore: config = Configuration.get_global_configuration() if config.is_at_home: storage_client = ApifyStorageClient.from_config(config) - return await KeyValueStore.open(name=kv_name, storage_client=storage_client) - return await KeyValueStore.open(name=kv_name) + return await KeyValueStore.open(name=kvs_name, storage_client=storage_client) + return await KeyValueStore.open(name=kvs_name) logger.debug("Starting background thread for cache storage's event loop") self._async_thread = AsyncThread() - logger.debug(f"Opening cache storage's {kv_name!r} key value store") - self._kv = self._async_thread.run_coro(open_kv()) + logger.debug(f"Opening cache storage's {kvs_name!r} key value store") + self._kvs = self._async_thread.run_coro(open_kvs()) def close_spider(self, _: Spider, current_time: int | None = None) -> None: """Close the cache storage for a spider.""" @@ -71,28 +71,28 @@ def close_spider(self, _: Spider, current_time: int | None = None) -> None: if current_time is None: current_time = int(time()) - async def expire_kv() -> None: - if self._kv is None: + async def expire_kvs() -> None: + if self._kvs is None: raise ValueError('Key value store not initialized') i = 0 - async for item in self._kv.iterate_keys(): - value = await self._kv.get_value(item.key) + async for item in self._kvs.iterate_keys(): + value = await self._kvs.get_value(item.key) try: gzip_time = read_gzip_time(value) except Exception as e: logger.warning(f'Malformed cache item {item.key}: {e}') - await self._kv.set_value(item.key, None) + await self._kvs.set_value(item.key, None) else: if self._expiration_secs < current_time - gzip_time: logger.debug(f'Expired cache item {item.key}') - await self._kv.set_value(item.key, None) + await self._kvs.set_value(item.key, 
None) else: logger.debug(f'Valid cache item {item.key}') if i == self._expiration_max_items: break i += 1 - self._async_thread.run_coro(expire_kv()) + self._async_thread.run_coro(expire_kvs()) logger.debug('Closing cache storage') try: @@ -108,13 +108,13 @@ def retrieve_response(self, _: Spider, request: Request, current_time: int | Non """Retrieve a response from the cache storage.""" if self._async_thread is None: raise ValueError('Async thread not initialized') - if self._kv is None: + if self._kvs is None: raise ValueError('Key value store not initialized') if self._fingerprinter is None: raise ValueError('Request fingerprinter not initialized') key = self._fingerprinter.fingerprint(request).hex() - value = self._async_thread.run_coro(self._kv.get_value(key)) + value = self._async_thread.run_coro(self._kvs.get_value(key)) if value is None: logger.debug('Cache miss', extra={'request': request}) @@ -140,7 +140,7 @@ def store_response(self, _: Spider, request: Request, response: Response) -> Non """Store a response in the cache storage.""" if self._async_thread is None: raise ValueError('Async thread not initialized') - if self._kv is None: + if self._kvs is None: raise ValueError('Key value store not initialized') if self._fingerprinter is None: raise ValueError('Request fingerprinter not initialized') @@ -153,7 +153,7 @@ def store_response(self, _: Spider, request: Request, response: Response) -> Non 'body': response.body, } value = to_gzip(data) - self._async_thread.run_coro(self._kv.set_value(key, value)) + self._async_thread.run_coro(self._kvs.set_value(key, value)) def to_gzip(data: dict, mtime: int | None = None) -> bytes: From d8adf62f71efda929590af547a2f7fd33e1f0cd4 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Mon, 10 Mar 2025 14:54:58 +0100 Subject: [PATCH 11/18] feat: set HTTPCACHE_STORAGE in apply_apify_settings, document usage --- docs/02_guides/05_scrapy.mdx | 1 + docs/02_guides/code/scrapy_project/src/settings.py | 2 ++ src/apify/scrapy/utils.py | 3 +++ 3 files changed, 6 insertions(+) diff --git a/docs/02_guides/05_scrapy.mdx b/docs/02_guides/05_scrapy.mdx index 35b6bb5e..220715ad 100644 --- a/docs/02_guides/05_scrapy.mdx +++ b/docs/02_guides/05_scrapy.mdx @@ -40,6 +40,7 @@ The Apify SDK provides several custom components to support integration with the - [`apify.scrapy.ApifyScheduler`](https://docs.apify.com/sdk/python/reference/class/ApifyScheduler) - Replaces Scrapy's default [scheduler](https://docs.scrapy.org/en/latest/topics/scheduler.html) with one that uses Apify's [request queue](https://docs.apify.com/platform/storage/request-queue) for storing requests. It manages enqueuing, dequeuing, and maintaining the state and priority of requests. - [`apify.scrapy.ActorDatasetPushPipeline`](https://docs.apify.com/sdk/python/reference/class/ActorDatasetPushPipeline) - A Scrapy [item pipeline](https://docs.scrapy.org/en/latest/topics/item-pipeline.html) that pushes scraped items to Apify's [dataset](https://docs.apify.com/platform/storage/dataset). When enabled, every item produced by the spider is sent to the dataset. - [`apify.scrapy.ApifyHttpProxyMiddleware`](https://docs.apify.com/sdk/python/reference/class/ApifyHttpProxyMiddleware) - A Scrapy [middleware](https://docs.scrapy.org/en/latest/topics/downloader-middleware.html) that manages proxy configurations. This middleware replaces Scrapy's default `HttpProxyMiddleware` to facilitate the use of Apify's proxy service. 
+- [`apify.scrapy.extensions.httpcache.ApifyCacheStorage`](https://docs.apify.com/sdk/python/reference/class/ApifyCacheStorage) - A storage backend for the built-in Scrapy [middleware](https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#module-scrapy.downloadermiddlewares.httpcache) that manages caching responses to HTTP requests. This backend uses Apify's [key-value store](https://docs.apify.com/platform/storage/key-value-store). Don't forget to set `HTTPCACHE_ENABLED` and `HTTPCACHE_EXPIRATION_SECS` in your settings, otherwise no caching takes place. Additional helper functions in the [`apify.scrapy`](https://github.com/apify/apify-sdk-python/tree/master/src/apify/scrapy) subpackage include: diff --git a/docs/02_guides/code/scrapy_project/src/settings.py b/docs/02_guides/code/scrapy_project/src/settings.py index 62e11bfb..5c0e56e3 100644 --- a/docs/02_guides/code/scrapy_project/src/settings.py +++ b/docs/02_guides/code/scrapy_project/src/settings.py @@ -7,3 +7,5 @@ TELNETCONSOLE_ENABLED = False # Do not change the Twisted reactor unless you really know what you are doing. TWISTED_REACTOR = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor' +HTTPCACHE_ENABLED = True +HTTPCACHE_EXPIRATION_SECS = 7200 diff --git a/src/apify/scrapy/utils.py b/src/apify/scrapy/utils.py index 860c1c33..c6e22cb6 100644 --- a/src/apify/scrapy/utils.py +++ b/src/apify/scrapy/utils.py @@ -44,6 +44,9 @@ def apply_apify_settings(*, settings: Settings | None = None, proxy_config: dict settings['DOWNLOADER_MIDDLEWARES']['scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware'] = None settings['DOWNLOADER_MIDDLEWARES']['apify.scrapy.middlewares.ApifyHttpProxyMiddleware'] = 750 + # Set the default HTTPCache middleware storage backend to ApifyCacheStorage + settings['HTTPCACHE_STORAGE'] = 'apify.scrapy.extensions.httpcache.ApifyCacheStorage' + # Store the proxy configuration settings['APIFY_PROXY_SETTINGS'] = proxy_config From 1b02e455ca111ef38f742c88f7622442926c4263 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Mon, 10 Mar 2025 14:57:41 +0100 Subject: [PATCH 12/18] docs: improve stylistics --- docs/02_guides/05_scrapy.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/02_guides/05_scrapy.mdx b/docs/02_guides/05_scrapy.mdx index 220715ad..dca075ba 100644 --- a/docs/02_guides/05_scrapy.mdx +++ b/docs/02_guides/05_scrapy.mdx @@ -40,7 +40,7 @@ The Apify SDK provides several custom components to support integration with the - [`apify.scrapy.ApifyScheduler`](https://docs.apify.com/sdk/python/reference/class/ApifyScheduler) - Replaces Scrapy's default [scheduler](https://docs.scrapy.org/en/latest/topics/scheduler.html) with one that uses Apify's [request queue](https://docs.apify.com/platform/storage/request-queue) for storing requests. It manages enqueuing, dequeuing, and maintaining the state and priority of requests. - [`apify.scrapy.ActorDatasetPushPipeline`](https://docs.apify.com/sdk/python/reference/class/ActorDatasetPushPipeline) - A Scrapy [item pipeline](https://docs.scrapy.org/en/latest/topics/item-pipeline.html) that pushes scraped items to Apify's [dataset](https://docs.apify.com/platform/storage/dataset). When enabled, every item produced by the spider is sent to the dataset. - [`apify.scrapy.ApifyHttpProxyMiddleware`](https://docs.apify.com/sdk/python/reference/class/ApifyHttpProxyMiddleware) - A Scrapy [middleware](https://docs.scrapy.org/en/latest/topics/downloader-middleware.html) that manages proxy configurations. 
This middleware replaces Scrapy's default `HttpProxyMiddleware` to facilitate the use of Apify's proxy service. -- [`apify.scrapy.extensions.httpcache.ApifyCacheStorage`](https://docs.apify.com/sdk/python/reference/class/ApifyCacheStorage) - A storage backend for the built-in Scrapy [middleware](https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#module-scrapy.downloadermiddlewares.httpcache) that manages caching responses to HTTP requests. This backend uses Apify's [key-value store](https://docs.apify.com/platform/storage/key-value-store). Don't forget to set `HTTPCACHE_ENABLED` and `HTTPCACHE_EXPIRATION_SECS` in your settings, otherwise no caching takes place. +- [`apify.scrapy.extensions.httpcache.ApifyCacheStorage`](https://docs.apify.com/sdk/python/reference/class/ApifyCacheStorage) - A storage backend for Scrapy's built-in [HTTP cache middleware](https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#module-scrapy.downloadermiddlewares.httpcache). This backend uses Apify's [key-value store](https://docs.apify.com/platform/storage/key-value-store). Make sure to set `HTTPCACHE_ENABLED` and `HTTPCACHE_EXPIRATION_SECS` in your settings, or caching won't work. Additional helper functions in the [`apify.scrapy`](https://github.com/apify/apify-sdk-python/tree/master/src/apify/scrapy) subpackage include: From b6969baebdb5a00556275124b3a8cc8f5bafaad8 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Mon, 10 Mar 2025 15:08:44 +0100 Subject: [PATCH 13/18] docs: document workaround for https://github.com/apify/actor-templates/issues/303 --- docs/02_guides/05_scrapy.mdx | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docs/02_guides/05_scrapy.mdx b/docs/02_guides/05_scrapy.mdx index dca075ba..1db33c55 100644 --- a/docs/02_guides/05_scrapy.mdx +++ b/docs/02_guides/05_scrapy.mdx @@ -95,6 +95,12 @@ The following example demonstrates a Scrapy Actor that scrapes page titles and e +## Dealing with ‘imminent migration to another host’ + +Under some circumstances the platform may decide to [migrate your Actor](https://docs.apify.com/academy/expert-scraping-with-apify/migrations-maintaining-state) from one piece of infrastructure to another while it's in progress. While [Crawlee](https://crawlee.dev/python)-based projects have the ability to pause and resume their work after restart, it may be a challenge to achieve the same with a Scrapy-based project. + +As a workaround to this problem (tracked as [apify/actor-templates#303](https://github.com/apify/actor-templates/issues/303)), turn on caching with `HTTPCACHE_ENABLED` and set `HTTPCACHE_EXPIRATION_SECS` to at least a few minutes, a conrete value depending on your use case. If your Actor gets migrated and restarted, the subsequent run will hit the cache, so it'll be fast and won't consume unnecessary resources. + ## Conclusion In this guide you learned how to use Scrapy in Apify Actors. You can now start building your own web scraping projects using Scrapy, the Apify SDK and host them on the Apify platform. See the [Actor templates](https://apify.com/templates/categories/python) to get started with your own scraping tasks. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/apify-sdk-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! 
From b40d5550124784912ccb18a61977dcd33318878b Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Mon, 10 Mar 2025 15:11:26 +0100 Subject: [PATCH 14/18] docs: improve stylistics --- docs/02_guides/05_scrapy.mdx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/02_guides/05_scrapy.mdx b/docs/02_guides/05_scrapy.mdx index 1db33c55..d66ffb83 100644 --- a/docs/02_guides/05_scrapy.mdx +++ b/docs/02_guides/05_scrapy.mdx @@ -97,9 +97,9 @@ The following example demonstrates a Scrapy Actor that scrapes page titles and e ## Dealing with ‘imminent migration to another host’ -Under some circumstances the platform may decide to [migrate your Actor](https://docs.apify.com/academy/expert-scraping-with-apify/migrations-maintaining-state) from one piece of infrastructure to another while it's in progress. While [Crawlee](https://crawlee.dev/python)-based projects have the ability to pause and resume their work after restart, it may be a challenge to achieve the same with a Scrapy-based project. +Under some circumstances, the platform may decide to [migrate your Actor](https://docs.apify.com/academy/expert-scraping-with-apify/migrations-maintaining-state) from one piece of infrastructure to another while it's in progress. While [Crawlee](https://crawlee.dev/python)-based projects can pause and resume their work after a restart, achieving the same with a Scrapy-based project can be challenging. -As a workaround to this problem (tracked as [apify/actor-templates#303](https://github.com/apify/actor-templates/issues/303)), turn on caching with `HTTPCACHE_ENABLED` and set `HTTPCACHE_EXPIRATION_SECS` to at least a few minutes, a conrete value depending on your use case. If your Actor gets migrated and restarted, the subsequent run will hit the cache, so it'll be fast and won't consume unnecessary resources. +As a workaround for this issue (tracked as [apify/actor-templates#303](https://github.com/apify/actor-templates/issues/303)), turn on caching with `HTTPCACHE_ENABLED` and set `HTTPCACHE_EXPIRATION_SECS` to at least a few minutes—the exact value depends on your use case. If your Actor gets migrated and restarted, the subsequent run will hit the cache, making it fast and avoiding unnecessary resource consumption. ## Conclusion From d07cbe17056efd2ab7133afc01f2bba92a2ae546 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Fri, 14 Mar 2025 11:44:32 +0100 Subject: [PATCH 15/18] fix: change public path of the ApifyCacheStorage class --- docs/02_guides/05_scrapy.mdx | 2 +- src/apify/scrapy/extensions/__init__.py | 3 +++ src/apify/scrapy/extensions/{httpcache.py => _httpcache.py} | 0 src/apify/scrapy/utils.py | 2 +- tests/unit/scrapy/extensions/test_httpcache.py | 2 +- 5 files changed, 6 insertions(+), 3 deletions(-) rename src/apify/scrapy/extensions/{httpcache.py => _httpcache.py} (100%) diff --git a/docs/02_guides/05_scrapy.mdx b/docs/02_guides/05_scrapy.mdx index d66ffb83..523a2423 100644 --- a/docs/02_guides/05_scrapy.mdx +++ b/docs/02_guides/05_scrapy.mdx @@ -40,7 +40,7 @@ The Apify SDK provides several custom components to support integration with the - [`apify.scrapy.ApifyScheduler`](https://docs.apify.com/sdk/python/reference/class/ApifyScheduler) - Replaces Scrapy's default [scheduler](https://docs.scrapy.org/en/latest/topics/scheduler.html) with one that uses Apify's [request queue](https://docs.apify.com/platform/storage/request-queue) for storing requests. It manages enqueuing, dequeuing, and maintaining the state and priority of requests. 
- [`apify.scrapy.ActorDatasetPushPipeline`](https://docs.apify.com/sdk/python/reference/class/ActorDatasetPushPipeline) - A Scrapy [item pipeline](https://docs.scrapy.org/en/latest/topics/item-pipeline.html) that pushes scraped items to Apify's [dataset](https://docs.apify.com/platform/storage/dataset). When enabled, every item produced by the spider is sent to the dataset. - [`apify.scrapy.ApifyHttpProxyMiddleware`](https://docs.apify.com/sdk/python/reference/class/ApifyHttpProxyMiddleware) - A Scrapy [middleware](https://docs.scrapy.org/en/latest/topics/downloader-middleware.html) that manages proxy configurations. This middleware replaces Scrapy's default `HttpProxyMiddleware` to facilitate the use of Apify's proxy service. -- [`apify.scrapy.extensions.httpcache.ApifyCacheStorage`](https://docs.apify.com/sdk/python/reference/class/ApifyCacheStorage) - A storage backend for Scrapy's built-in [HTTP cache middleware](https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#module-scrapy.downloadermiddlewares.httpcache). This backend uses Apify's [key-value store](https://docs.apify.com/platform/storage/key-value-store). Make sure to set `HTTPCACHE_ENABLED` and `HTTPCACHE_EXPIRATION_SECS` in your settings, or caching won't work. +- [`apify.scrapy.extensions.ApifyCacheStorage`](https://docs.apify.com/sdk/python/reference/class/ApifyCacheStorage) - A storage backend for Scrapy's built-in [HTTP cache middleware](https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#module-scrapy.downloadermiddlewares.httpcache). This backend uses Apify's [key-value store](https://docs.apify.com/platform/storage/key-value-store). Make sure to set `HTTPCACHE_ENABLED` and `HTTPCACHE_EXPIRATION_SECS` in your settings, or caching won't work. Additional helper functions in the [`apify.scrapy`](https://github.com/apify/apify-sdk-python/tree/master/src/apify/scrapy) subpackage include: diff --git a/src/apify/scrapy/extensions/__init__.py b/src/apify/scrapy/extensions/__init__.py index e69de29b..e9bccd1f 100644 --- a/src/apify/scrapy/extensions/__init__.py +++ b/src/apify/scrapy/extensions/__init__.py @@ -0,0 +1,3 @@ +from apify.scrapy.extensions._httpcache import ApifyCacheStorage + +__all__ = ['ApifyCacheStorage'] diff --git a/src/apify/scrapy/extensions/httpcache.py b/src/apify/scrapy/extensions/_httpcache.py similarity index 100% rename from src/apify/scrapy/extensions/httpcache.py rename to src/apify/scrapy/extensions/_httpcache.py diff --git a/src/apify/scrapy/utils.py b/src/apify/scrapy/utils.py index c6e22cb6..98af8d87 100644 --- a/src/apify/scrapy/utils.py +++ b/src/apify/scrapy/utils.py @@ -45,7 +45,7 @@ def apply_apify_settings(*, settings: Settings | None = None, proxy_config: dict settings['DOWNLOADER_MIDDLEWARES']['apify.scrapy.middlewares.ApifyHttpProxyMiddleware'] = 750 # Set the default HTTPCache middleware storage backend to ApifyCacheStorage - settings['HTTPCACHE_STORAGE'] = 'apify.scrapy.extensions.httpcache.ApifyCacheStorage' + settings['HTTPCACHE_STORAGE'] = 'apify.scrapy.extensions.ApifyCacheStorage' # Store the proxy configuration settings['APIFY_PROXY_SETTINGS'] = proxy_config diff --git a/tests/unit/scrapy/extensions/test_httpcache.py b/tests/unit/scrapy/extensions/test_httpcache.py index b1c8ed1e..b146d449 100644 --- a/tests/unit/scrapy/extensions/test_httpcache.py +++ b/tests/unit/scrapy/extensions/test_httpcache.py @@ -1,6 +1,6 @@ from time import time -from apify.scrapy.extensions.httpcache import from_gzip, read_gzip_time, to_gzip +from 
apify.scrapy.extensions._httpcache import from_gzip, read_gzip_time, to_gzip FIXTURE_DICT = {'name': 'Alice'} From 41af4dffc6b36bac813e0b3950e0b6e7addac00d Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Fri, 14 Mar 2025 12:50:04 +0100 Subject: [PATCH 16/18] Update run_code_checks.yaml --- .github/workflows/run_code_checks.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/run_code_checks.yaml b/.github/workflows/run_code_checks.yaml index dd9b4d23..b424a17b 100644 --- a/.github/workflows/run_code_checks.yaml +++ b/.github/workflows/run_code_checks.yaml @@ -26,5 +26,5 @@ jobs: integration_tests: name: Integration tests needs: [lint_check, type_check, unit_tests] - uses: apify/workflows/.github/workflows/python_integration_tests.yaml@main + uses: apify/workflows/.github/workflows/python_integration_tests.yaml@fix-integration-tests-from-forks secrets: inherit From 4ff56d7ebc57ec9cfb22cf201276a8e63b8e6d74 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Tue, 18 Mar 2025 09:23:10 +0100 Subject: [PATCH 17/18] feat: support wider variety of spider names --- src/apify/scrapy/extensions/_httpcache.py | 13 ++++++- .../unit/scrapy/extensions/test_httpcache.py | 35 ++++++++++++++++++- 2 files changed, 46 insertions(+), 2 deletions(-) diff --git a/src/apify/scrapy/extensions/_httpcache.py b/src/apify/scrapy/extensions/_httpcache.py index ec7caac6..8921ad4d 100644 --- a/src/apify/scrapy/extensions/_httpcache.py +++ b/src/apify/scrapy/extensions/_httpcache.py @@ -3,6 +3,7 @@ import gzip import io import pickle +import re import struct from logging import getLogger from time import time @@ -47,7 +48,7 @@ def open_spider(self, spider: Spider) -> None: logger.debug('Using Apify key value cache storage', extra={'spider': spider}) self._spider = spider self._fingerprinter = spider.crawler.request_fingerprinter - kvs_name = f'httpcache-{spider.name}' + kvs_name = get_kvs_name(spider.name) async def open_kvs() -> KeyValueStore: config = Configuration.get_global_configuration() @@ -177,3 +178,13 @@ def read_gzip_time(gzip_bytes: bytes) -> int: header_components = struct.unpack(' str: + """Get the key value store name for a spider.""" + slug = re.sub(r'[^a-zA-Z0-9-]', '-', spider_name) + slug = re.sub(r'-+', '-', slug) + slug = slug.strip('-') + if not slug: + raise ValueError(f'Unsupported spider name: {spider_name!r} (slug: {slug!r})') + return f'httpcache-{slug}' diff --git a/tests/unit/scrapy/extensions/test_httpcache.py b/tests/unit/scrapy/extensions/test_httpcache.py index b146d449..a0d3602c 100644 --- a/tests/unit/scrapy/extensions/test_httpcache.py +++ b/tests/unit/scrapy/extensions/test_httpcache.py @@ -1,6 +1,8 @@ from time import time -from apify.scrapy.extensions._httpcache import from_gzip, read_gzip_time, to_gzip +import pytest + +from apify.scrapy.extensions._httpcache import from_gzip, get_kvs_name, read_gzip_time, to_gzip FIXTURE_DICT = {'name': 'Alice'} @@ -35,3 +37,34 @@ def test_read_gzip_time_non_zero() -> None: data_bytes = to_gzip(FIXTURE_DICT, mtime=current_time) assert read_gzip_time(data_bytes) == current_time + + +@pytest.mark.parametrize( + ('spider_name', 'expected'), + [ + ('test', 'httpcache-test'), + ('123', 'httpcache-123'), + ('test-spider', 'httpcache-test-spider'), + ('test_spider', 'httpcache-test-spider'), + ('test spider', 'httpcache-test-spider'), + ('test👻spider', 'httpcache-test-spider'), + ('test@spider', 'httpcache-test-spider'), + (' test spider ', 'httpcache-test-spider'), + ('testspider.com', 'httpcache-testspider-com'), + ], 
+) +def test_get_kvs_name(spider_name: str, expected: str) -> None: + assert get_kvs_name(spider_name) == expected + + +@pytest.mark.parametrize( + ('spider_name'), + [ + '', + '-', + '-@-/-', + ], +) +def test_get_kvs_name_raises(spider_name: str) -> None: + with pytest.raises(ValueError, match='Unsupported spider name'): + assert get_kvs_name(spider_name) From 22af7acf189712d4e2ff7fb80a52c9af22f84d51 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Tue, 18 Mar 2025 14:17:01 +0100 Subject: [PATCH 18/18] fix: truncate too long kvs names --- src/apify/scrapy/extensions/_httpcache.py | 28 +++++++++++++++++-- .../unit/scrapy/extensions/test_httpcache.py | 1 + 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/src/apify/scrapy/extensions/_httpcache.py b/src/apify/scrapy/extensions/_httpcache.py index 8921ad4d..509c4d8a 100644 --- a/src/apify/scrapy/extensions/_httpcache.py +++ b/src/apify/scrapy/extensions/_httpcache.py @@ -180,11 +180,33 @@ def read_gzip_time(gzip_bytes: bytes) -> int: return mtime -def get_kvs_name(spider_name: str) -> str: - """Get the key value store name for a spider.""" +def get_kvs_name(spider_name: str, max_length: int = 60) -> str: + """Get the key value store name for a spider. + + The key value store name is derived from the spider name by replacing all special characters + with hyphens and trimming leading and trailing hyphens. The resulting name is prefixed with + 'httpcache-' and truncated to the maximum length. + + The documentation + [about storages](https://docs.apify.com/platform/storage/usage#named-and-unnamed-storages) + mentions that names can be up to 63 characters long, so the default max length is set to 60. + + Such naming isn't unique per spider, but should be sufficiently unique for most use cases. + The name of the key value store should indicate to which spider it belongs, e.g. in + the listing in the Apify's console. + + Args: + spider_name: Value of the Spider instance's name attribute. + max_length: Maximum length of the key value store name. + + Returns: Key value store name. + + Raises: + ValueError: If the spider name contains only special characters. + """ slug = re.sub(r'[^a-zA-Z0-9-]', '-', spider_name) slug = re.sub(r'-+', '-', slug) slug = slug.strip('-') if not slug: raise ValueError(f'Unsupported spider name: {spider_name!r} (slug: {slug!r})') - return f'httpcache-{slug}' + return f'httpcache-{slug}'[:max_length] diff --git a/tests/unit/scrapy/extensions/test_httpcache.py b/tests/unit/scrapy/extensions/test_httpcache.py index a0d3602c..51adbd63 100644 --- a/tests/unit/scrapy/extensions/test_httpcache.py +++ b/tests/unit/scrapy/extensions/test_httpcache.py @@ -51,6 +51,7 @@ def test_read_gzip_time_non_zero() -> None: ('test@spider', 'httpcache-test-spider'), (' test spider ', 'httpcache-test-spider'), ('testspider.com', 'httpcache-testspider-com'), + ('t' * 100, 'httpcache-tttttttttttttttttttttttttttttttttttttttttttttttttt'), ], ) def test_get_kvs_name(spider_name: str, expected: str) -> None:
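
---

For reference, below is a minimal sketch of a Scrapy `settings.py` that opts into the cache storage introduced by this patch series. The `HTTPCACHE_*` names and the `apify.scrapy.extensions.ApifyCacheStorage` path mirror the documentation and `apply_apify_settings` changes above; the project name and the expiration value are illustrative assumptions, not part of this PR. Note that when a project goes through `apply_apify_settings`, `HTTPCACHE_STORAGE` is filled in automatically, so only the two `HTTPCACHE_` flags need to be added there.

```python
# Sketch of a Scrapy settings module enabling the HTTP cache backed by Apify's key-value store.
# Everything except HTTPCACHE_STORAGE is an illustrative assumption; tune values per use case.

BOT_NAME = 'titles_spider'  # hypothetical project name

# Keep the asyncio Twisted reactor, which ApifyCacheStorage requires.
TWISTED_REACTOR = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor'

# Enable Scrapy's built-in HttpCacheMiddleware and point it at the Apify backend.
HTTPCACHE_ENABLED = True
HTTPCACHE_EXPIRATION_SECS = 7200  # e.g. two hours; per the implementation above, 0 skips expiration checks
HTTPCACHE_STORAGE = 'apify.scrapy.extensions.ApifyCacheStorage'  # set automatically by apply_apify_settings
```

With these settings, responses are cached in a key-value store named `httpcache-<spider name slug>` (truncated to fit the platform's name length limit), which also makes runs resilient to the "imminent migration to another host" scenario described in the docs changes.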