Skip to content

Commit 055a5a6

Browse files
authored
Track in stats which fields from Zyte API automatic extraction are not overridden (#202)
1 parent 42e81f6 commit 055a5a6

File tree

6 files changed

+702
-17
lines changed

6 files changed

+702
-17
lines changed

CHANGES.rst

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,13 @@
11
Changes
22
=======
33

4+
N.N.N (YYYY-MM-DD)
5+
------------------
6+
7+
* ``scrapy-zyte-api[provider]`` now requires zyte-common-items >= 0.20.0.
8+
9+
* Added the :setting:`ZYTE_API_AUTO_FIELD_STATS` setting.
10+
411
0.21.0 (2024-07-02)
512
-------------------
613

docs/reference/settings.rst

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,40 @@ Settings
66

77
:ref:`Settings <topics-settings>` for scrapy-zyte-api.
88

9+
.. setting:: ZYTE_API_AUTO_FIELD_STATS
10+
11+
ZYTE_API_AUTO_FIELD_STATS
12+
=========================
13+
14+
Default: ``False``
15+
16+
Enables stats that indicate which requested fields :ref:`obtained through
17+
scrapy-poet integration <scrapy-poet>` come directly from
18+
:ref:`zyte-api-extract`.
19+
20+
If for any request no page object class is used to override
21+
:ref:`zyte-api-extract` fields for a given item type, the following stat is
22+
set:
23+
24+
.. code-block:: python
25+
26+
"scrapy-zyte-api/auto_fields/<item class import path>": "(all fields)"
27+
28+
.. note:: A literal ``(all fields)`` string is used as value, not a list with
29+
all fields.
30+
31+
If for any request a custom page object class is used to override some
32+
:ref:`zyte-api-extract` fields, the following stat is set:
33+
34+
.. code-block:: python
35+
36+
"scrapy-zyte-api/auto_fields/<override class import path>": (
37+
"<space-separated list of fields not overridden>"
38+
)
39+
40+
.. note:: :func:`zyte_common_items.fields.is_auto_field` is used to determine
41+
whether a field has been overridden or not.
42+
943
.. setting:: ZYTE_API_AUTOMAP_PARAMS
1044

1145
ZYTE_API_AUTOMAP_PARAMS

scrapy_zyte_api/providers.py

Lines changed: 64 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from typing import Any, Callable, Dict, List, Optional, Sequence, Set
1+
from typing import Any, Callable, Dict, List, Optional, Sequence, Set, Type, cast
22

33
from andi.typeutils import is_typing_annotated, strip_annotated
44
from scrapy import Request
@@ -13,16 +13,26 @@
1313
HttpResponseHeaders,
1414
)
1515
from web_poet.annotated import AnnotatedInstance
16+
from web_poet.fields import get_fields_dict
17+
from web_poet.utils import get_fq_class_name
1618
from zyte_common_items import (
1719
Article,
1820
ArticleList,
1921
ArticleNavigation,
22+
AutoArticleListPage,
23+
AutoArticleNavigationPage,
24+
AutoArticlePage,
25+
AutoJobPostingPage,
26+
AutoProductListPage,
27+
AutoProductNavigationPage,
28+
AutoProductPage,
2029
Item,
2130
JobPosting,
2231
Product,
2332
ProductList,
2433
ProductNavigation,
2534
)
35+
from zyte_common_items.fields import is_auto_field
2636

2737
from scrapy_zyte_api import Actions, ExtractFrom, Geolocation, Screenshot
2838
from scrapy_zyte_api._annotations import _ActionResult
@@ -35,6 +45,26 @@
3545
NO_CALLBACK = None
3646

3747

48+
# Maps each supported zyte-common-items item class to the keyword string used
# for it by ZyteApiProvider: the provider looks classes up here
# (``_ITEM_KEYWORDS.get(...)``) and derives the matching ``<keyword>Options``
# parameter name from the value.
_ITEM_KEYWORDS: Dict[type, str] = {
49+
Product: "product",
50+
ProductList: "productList",
51+
ProductNavigation: "productNavigation",
52+
Article: "article",
53+
ArticleList: "articleList",
54+
ArticleNavigation: "articleNavigation",
55+
JobPosting: "jobPosting",
56+
}
57+
# Auto* page classes imported from zyte_common_items.
# NOTE(review): this set is not referenced in the code visible here; confirm
# where it is consumed before relying on it.
_AUTO_PAGES: Set[type] = {
58+
AutoArticlePage,
59+
AutoArticleListPage,
60+
AutoArticleNavigationPage,
61+
AutoJobPostingPage,
62+
AutoProductPage,
63+
AutoProductListPage,
64+
AutoProductNavigationPage,
65+
}
66+
67+
3868
class ZyteApiProvider(PageObjectInputProvider):
3969
name = "zyte_api"
4070

@@ -54,9 +84,38 @@ class ZyteApiProvider(PageObjectInputProvider):
5484
Screenshot,
5585
}
5686

87+
def __init__(self, *args, **kwargs):
88+
super().__init__(*args, **kwargs)
89+
self._should_track_auto_fields = None
90+
self._tracked_auto_fields = set()
91+
5792
def is_provided(self, type_: Callable) -> bool:
5893
return super().is_provided(strip_annotated(type_))
5994

95+
def _track_auto_fields(self, crawler: Crawler, request: Request, cls: Type):
    """Record in the crawler stats which fields of *cls* are auto fields.

    Does nothing when *cls* is not a key of ``_ITEM_KEYWORDS`` or when the
    ``ZYTE_API_AUTO_FIELD_STATS`` setting is disabled. The setting is read
    on first use and cached on the provider.

    The class that gets reported is the page class that the injector
    registry returns for *cls* and the request URL, or *cls* itself when
    the registry returns nothing. Each reported class is recorded once.

    The stat ``scrapy-zyte-api/auto_fields/<fully qualified class name>``
    is set to ``"(all fields)"`` when the reported class is itself a key
    of ``_ITEM_KEYWORDS``; otherwise it is set to the space-separated,
    sorted names of the fields for which ``is_auto_field`` is true.

    :param crawler: crawler whose settings are read and whose stats are
        updated.
    :param request: request whose URL is used for the page class lookup.
    :param cls: requested dependency class.
    """
    if cls not in _ITEM_KEYWORDS:
        return

    enabled = self._should_track_auto_fields
    if enabled is None:
        # First call: read the setting once and remember the answer.
        enabled = self._should_track_auto_fields = crawler.settings.getbool(
            "ZYTE_API_AUTO_FIELD_STATS", False
        )
    if enabled is False:
        return

    page_cls = self.injector.registry.page_cls_for_item(request.url, cls)
    target = page_cls or cls
    if target in self._tracked_auto_fields:
        return
    self._tracked_auto_fields.add(target)

    if target in _ITEM_KEYWORDS:
        # No page class was found, so the item class itself is reported.
        stat_value = "(all fields)"
    else:
        stat_value = " ".join(
            sorted(
                name
                for name in get_fields_dict(target)
                if is_auto_field(target, name)  # type: ignore[arg-type]
            )
        )

    stat_key = f"scrapy-zyte-api/auto_fields/{get_fq_class_name(target)}"
    crawler.stats.set_value(stat_key, stat_value)
118+
60119
async def __call__( # noqa: C901
61120
self, to_provide: Set[Callable], request: Request, crawler: Crawler
62121
) -> Sequence[Any]:
@@ -66,6 +125,7 @@ async def __call__( # noqa: C901
66125
http_response = None
67126
screenshot_requested = Screenshot in to_provide
68127
for cls in list(to_provide):
128+
self._track_auto_fields(crawler, request, cast(type, cls))
69129
item = self.injector.weak_cache.get(request, {}).get(cls)
70130
if item:
71131
results.append(item)
@@ -89,15 +149,6 @@ async def __call__( # noqa: C901
89149
return results
90150

91151
html_requested = BrowserResponse in to_provide or BrowserHtml in to_provide
92-
item_keywords: Dict[type, str] = {
93-
Product: "product",
94-
ProductList: "productList",
95-
ProductNavigation: "productNavigation",
96-
Article: "article",
97-
ArticleList: "articleList",
98-
ArticleNavigation: "articleNavigation",
99-
JobPosting: "jobPosting",
100-
}
101152

102153
zyte_api_meta = {
103154
**crawler.settings.getdict("ZYTE_API_PROVIDER_PARAMS"),
@@ -135,7 +186,7 @@ async def __call__( # noqa: C901
135186
}
136187
)
137188
continue
138-
kw = item_keywords.get(cls_stripped)
189+
kw = _ITEM_KEYWORDS.get(cls_stripped)
139190
if not kw:
140191
continue
141192
item_requested = True
@@ -165,7 +216,7 @@ async def __call__( # noqa: C901
165216
)
166217

167218
extract_from = None # type: ignore[assignment]
168-
for item_type, kw in item_keywords.items():
219+
for item_type, kw in _ITEM_KEYWORDS.items():
169220
options_name = f"{kw}Options"
170221
if item_type not in to_provide_stripped and options_name in zyte_api_meta:
171222
del zyte_api_meta[options_name]
@@ -271,7 +322,7 @@ async def __call__( # noqa: C901
271322
result = AnnotatedInstance(Actions(actions_result), cls.__metadata__) # type: ignore[attr-defined]
272323
results.append(result)
273324
continue
274-
kw = item_keywords.get(cls_stripped)
325+
kw = _ITEM_KEYWORDS.get(cls_stripped)
275326
if not kw:
276327
continue
277328
assert issubclass(cls_stripped, Item)

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ def get_version():
3333
"andi>=0.6.0",
3434
"scrapy-poet>=0.22.3",
3535
"web-poet>=0.17.0",
36-
"zyte-common-items>=0.8.0",
36+
"zyte-common-items>=0.20.0",
3737
]
3838
},
3939
classifiers=[

0 commit comments

Comments
 (0)