-
Notifications
You must be signed in to change notification settings - Fork 22
Expand file tree
/
Copy pathhandler.py
More file actions
115 lines (102 loc) · 4.38 KB
/
handler.py
File metadata and controls
115 lines (102 loc) · 4.38 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import json
import logging
import os
from typing import Any, Dict, Generator, Optional, Union
from scrapy import Spider
from scrapy.core.downloader.handlers.http import HTTPDownloadHandler
from scrapy.crawler import Crawler
from scrapy.exceptions import IgnoreRequest, NotConfigured
from scrapy.http import Request
from scrapy.settings import Settings
from scrapy.utils.defer import deferred_from_coro
from scrapy.utils.reactor import verify_installed_reactor
from twisted.internet.defer import Deferred, inlineCallbacks
from zyte_api.aio.client import AsyncClient, create_session
from zyte_api.aio.errors import RequestError
from .responses import ZyteAPIResponse, ZyteAPITextResponse, process_response
# Module-level logger named after this module, per stdlib/Scrapy convention.
logger = logging.getLogger(__name__)
class ScrapyZyteAPIDownloadHandler(HTTPDownloadHandler):
    """Scrapy download handler that routes requests through Zyte API.

    Requests carrying a truthy ``zyte_api`` dict in ``request.meta`` are sent
    to Zyte API via the async client; all other requests fall back to the
    stock :class:`HTTPDownloadHandler` behavior.
    """

    def __init__(
        self,
        settings: Settings,
        crawler: Crawler,
        client: Optional[AsyncClient] = None,
    ):
        """Initialize the handler.

        :param settings: crawler settings (``ZYTE_API_DEFAULT_PARAMS`` is read
            from here).
        :param crawler: the running crawler; provides stats and the ``JOB`` id.
        :param client: optional pre-built :class:`AsyncClient` (injected by
            :meth:`from_crawler` or by tests); a default client is created
            when omitted.
        """
        super().__init__(settings=settings, crawler=crawler)
        self._client: AsyncClient = client if client else AsyncClient()
        # Zyte API calls are asyncio coroutines, so the asyncio-backed
        # Twisted reactor is mandatory.
        verify_installed_reactor(
            "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
        )
        self._stats = crawler.stats
        self._job_id = crawler.settings.get("JOB")
        self._zyte_api_default_params = settings.getdict("ZYTE_API_DEFAULT_PARAMS")
        self._session = create_session()

    @classmethod
    def from_crawler(cls, crawler):
        """Build the handler from a crawler, requiring a Zyte API key.

        :raises NotConfigured: when ``ZYTE_API_KEY`` is set neither in the
            settings nor in the environment.
        """
        zyte_api_key = crawler.settings.get("ZYTE_API_KEY") or os.getenv("ZYTE_API_KEY")
        if not zyte_api_key:
            logger.warning(
                "'ZYTE_API_KEY' must be set in the spider settings or env var "
                "in order for ScrapyZyteAPIDownloadHandler to work."
            )
            raise NotConfigured
        # Log only a short prefix of the key to aid debugging without
        # exposing the full credential.
        logger.info(f"Using Zyte API Key: {zyte_api_key[:7]}")
        client = AsyncClient(api_key=zyte_api_key)
        return cls(crawler.settings, crawler, client)

    def download_request(self, request: Request, spider: Spider) -> Deferred:
        """Dispatch to Zyte API when ``zyte_api`` meta is truthy, else to the
        default HTTP handler."""
        if request.meta.get("zyte_api"):
            return deferred_from_coro(self._download_request(request, spider))
        else:
            return super().download_request(request, spider)

    async def _download_request(
        self, request: Request, spider: Spider
    ) -> Optional[Union[ZyteAPITextResponse, ZyteAPIResponse]]:
        """Perform a single Zyte API request and convert the raw result.

        :raises IgnoreRequest: on malformed ``zyte_api`` meta or on any
            Zyte API / transport error (logged before raising).
        """
        # Copy the defaults: updating the shared default-params dict in place
        # would leak this request's parameters into every later request.
        api_params: Dict[str, Any] = dict(self._zyte_api_default_params or {})
        try:
            api_params.update(request.meta.get("zyte_api") or {})
        except TypeError:
            logger.error(
                f"zyte_api parameters in the request meta should be "
                f"provided as dictionary, got {type(request.meta.get('zyte_api'))} "
                f"instead ({request.url})."
            )
            raise IgnoreRequest()
        # Define url by default; explicit meta params may override it.
        api_data = {**{"url": request.url}, **api_params}
        if self._job_id is not None:
            api_data["jobId"] = self._job_id
        try:
            api_response = await self._client.request_raw(
                api_data, session=self._session
            )
        except RequestError as er:
            error_message = self._get_request_error_message(er)
            logger.error(
                f"Got Zyte API error ({er.status}) while processing URL ({request.url}): {error_message}"
            )
            raise IgnoreRequest()
        except Exception as er:
            logger.error(
                f"Got an error when processing Zyte API request ({request.url}): {er}"
            )
            raise IgnoreRequest()
        self._stats.inc_value("scrapy-zyte-api/request_count")
        return process_response(api_response, request)

    @inlineCallbacks
    def close(self) -> Generator:
        """Close the parent handler, then the shared aiohttp session."""
        yield super().close()
        yield deferred_from_coro(self._close())

    async def _close(self) -> None:  # NOQA
        await self._session.close()

    @staticmethod
    def _get_request_error_message(error: RequestError) -> str:
        """Extract the most specific human-readable message from a Zyte API
        error, preferring the JSON ``detail`` field of the response body."""
        if hasattr(error, "message"):
            base_message = error.message
        else:
            base_message = str(error)
        if not hasattr(error, "response_content"):
            return base_message
        try:
            error_data = json.loads(error.response_content.decode("utf-8"))
        except (AttributeError, TypeError, ValueError):
            return base_message
        if error_data.get("detail"):
            return error_data["detail"]
        return base_message