1- from typing import Any , Callable , Dict , List , Optional , Sequence , Set
1+ from typing import Any , Callable , Dict , List , Optional , Sequence , Set , Type , cast
22
33from andi .typeutils import is_typing_annotated , strip_annotated
44from scrapy import Request
1313 HttpResponseHeaders ,
1414)
1515from web_poet .annotated import AnnotatedInstance
16+ from web_poet .fields import get_fields_dict
17+ from web_poet .utils import get_fq_class_name
1618from zyte_common_items import (
1719 Article ,
1820 ArticleList ,
1921 ArticleNavigation ,
22+ AutoArticleListPage ,
23+ AutoArticleNavigationPage ,
24+ AutoArticlePage ,
25+ AutoJobPostingPage ,
26+ AutoProductListPage ,
27+ AutoProductNavigationPage ,
28+ AutoProductPage ,
2029 Item ,
2130 JobPosting ,
2231 Product ,
2332 ProductList ,
2433 ProductNavigation ,
2534)
35+ from zyte_common_items .fields import is_auto_field
2636
2737from scrapy_zyte_api import Actions , ExtractFrom , Geolocation , Screenshot
2838from scrapy_zyte_api ._annotations import _ActionResult
3545 NO_CALLBACK = None
3646
3747
# Maps each supported item class to the keyword string associated with it.
# The keyword is also used to derive the matching options key
# (f"{kw}Options"). Insertion order is kept as-is because the mapping is
# iterated with .items() elsewhere in this module.
_ITEM_KEYWORDS: Dict[type, str] = {
    # Product family
    Product: "product",
    ProductList: "productList",
    ProductNavigation: "productNavigation",
    # Article family
    Article: "article",
    ArticleList: "articleList",
    ArticleNavigation: "articleNavigation",
    # Job postings
    JobPosting: "jobPosting",
}
# The Auto*Page classes imported from zyte_common_items, one per item class
# listed in _ITEM_KEYWORDS.
# NOTE(review): no use of this set is visible in this part of the file;
# confirm where it is consumed.
_AUTO_PAGES: Set[type] = {
    # Product family
    AutoProductPage,
    AutoProductListPage,
    AutoProductNavigationPage,
    # Article family
    AutoArticlePage,
    AutoArticleListPage,
    AutoArticleNavigationPage,
    # Job postings
    AutoJobPostingPage,
}
66+
67+
3868class ZyteApiProvider (PageObjectInputProvider ):
3969 name = "zyte_api"
4070
@@ -54,9 +84,38 @@ class ZyteApiProvider(PageObjectInputProvider):
5484 Screenshot ,
5585 }
5686
def __init__(self, *args, **kwargs):
    """Set up the provider and its auto-field tracking state.

    All positional and keyword arguments are forwarded unchanged to the
    parent class initialiser.
    """
    super().__init__(*args, **kwargs)
    # Classes for which an auto-field stat has already been written, so
    # that each class is reported at most once.
    self._tracked_auto_fields: Set[type] = set()
    # Tri-state flag: None until the ZYTE_API_AUTO_FIELD_STATS setting has
    # been read, then the boolean value of that setting.
    self._should_track_auto_fields: Optional[bool] = None
91+
def is_provided(self, type_: Callable) -> bool:
    """Tell whether this provider can supply *type_*.

    *type_* is first passed through ``strip_annotated`` and the result is
    then checked by the parent class implementation.
    """
    bare_type = strip_annotated(type_)
    return super().is_provided(bare_type)
5994
def _track_auto_fields(self, crawler: Crawler, request: Request, cls: Type):
    """Write a stat describing the auto fields of the class behind *cls*.

    Nothing happens when *cls* is not a key of ``_ITEM_KEYWORDS`` or when
    the ``ZYTE_API_AUTO_FIELD_STATS`` setting (default ``False``) is off.
    The setting is read from ``crawler.settings`` on first need and cached
    on the instance.

    The class that gets reported is the result of
    ``self.injector.registry.page_cls_for_item(request.url, cls)`` when
    that is truthy, otherwise *cls* itself. Each reported class is handled
    only once per provider instance.

    The stat ``scrapy-zyte-api/auto_fields/<fully qualified class name>``
    is set to ``"(all fields)"`` when the reported class is itself a key of
    ``_ITEM_KEYWORDS``; otherwise to the space-separated, sorted names of
    its fields for which ``is_auto_field`` is true.
    """
    if cls not in _ITEM_KEYWORDS:
        return

    # Resolve the setting lazily and remember the answer.
    if self._should_track_auto_fields is None:
        self._should_track_auto_fields = crawler.settings.getbool(
            "ZYTE_API_AUTO_FIELD_STATS", False
        )
    if self._should_track_auto_fields is False:
        return

    page_cls = self.injector.registry.page_cls_for_item(request.url, cls)
    target = page_cls or cls

    # Report every class at most once.
    if target in self._tracked_auto_fields:
        return
    self._tracked_auto_fields.add(target)

    if target in _ITEM_KEYWORDS:
        # No page class took over: the item class itself is reported.
        summary = "(all fields)"
    else:
        auto_names = {
            name
            for name in get_fields_dict(target)
            if is_auto_field(target, name)  # type: ignore[arg-type]
        }
        summary = " ".join(sorted(auto_names))

    cls_fqn = get_fq_class_name(target)
    stat_key = f"scrapy-zyte-api/auto_fields/{cls_fqn}"
    crawler.stats.set_value(stat_key, summary)
118+
60119 async def __call__ ( # noqa: C901
61120 self , to_provide : Set [Callable ], request : Request , crawler : Crawler
62121 ) -> Sequence [Any ]:
@@ -66,6 +125,7 @@ async def __call__( # noqa: C901
66125 http_response = None
67126 screenshot_requested = Screenshot in to_provide
68127 for cls in list (to_provide ):
128+ self ._track_auto_fields (crawler , request , cast (type , cls ))
69129 item = self .injector .weak_cache .get (request , {}).get (cls )
70130 if item :
71131 results .append (item )
@@ -89,15 +149,6 @@ async def __call__( # noqa: C901
89149 return results
90150
91151 html_requested = BrowserResponse in to_provide or BrowserHtml in to_provide
92- item_keywords : Dict [type , str ] = {
93- Product : "product" ,
94- ProductList : "productList" ,
95- ProductNavigation : "productNavigation" ,
96- Article : "article" ,
97- ArticleList : "articleList" ,
98- ArticleNavigation : "articleNavigation" ,
99- JobPosting : "jobPosting" ,
100- }
101152
102153 zyte_api_meta = {
103154 ** crawler .settings .getdict ("ZYTE_API_PROVIDER_PARAMS" ),
@@ -135,7 +186,7 @@ async def __call__( # noqa: C901
135186 }
136187 )
137188 continue
138- kw = item_keywords .get (cls_stripped )
189+ kw = _ITEM_KEYWORDS .get (cls_stripped )
139190 if not kw :
140191 continue
141192 item_requested = True
@@ -165,7 +216,7 @@ async def __call__( # noqa: C901
165216 )
166217
167218 extract_from = None # type: ignore[assignment]
168- for item_type , kw in item_keywords .items ():
219+ for item_type , kw in _ITEM_KEYWORDS .items ():
169220 options_name = f"{ kw } Options"
170221 if item_type not in to_provide_stripped and options_name in zyte_api_meta :
171222 del zyte_api_meta [options_name ]
@@ -271,7 +322,7 @@ async def __call__( # noqa: C901
271322 result = AnnotatedInstance (Actions (actions_result ), cls .__metadata__ ) # type: ignore[attr-defined]
272323 results .append (result )
273324 continue
274- kw = item_keywords .get (cls_stripped )
325+ kw = _ITEM_KEYWORDS .get (cls_stripped )
275326 if not kw :
276327 continue
277328 assert issubclass (cls_stripped , Item )
0 commit comments