@@ -86,7 +86,7 @@ class ItemLoader:
8686 default_input_processor = Identity ()
8787 default_output_processor = Identity ()
8888
89- def __init__ (self , item = None , selector = None , parent = None , ** context ):
89+ def __init__ (self , item = None , selector = None , parent = None , stats = None , ** context ):
9090 self .selector = selector
9191 context .update (selector = selector )
9292 if item is None :
@@ -99,6 +99,14 @@ def __init__(self, item=None, selector=None, parent=None, **context):
9999 for field_name , value in item .items ():
100100 self ._values [field_name ] += arg_to_iter (value )
101101
102+ # This is the new injected dependency that we'll be using as the main
103+ # functionality of this tool.
104+ self .stats = stats
105+
106+ # This keeps track of the position of the 'field' name that is being
107+ # loaded for a more accurate logging in the stats.
108+ self .field_tracker = defaultdict (int )
109+
102110 @property
103111 def _values (self ):
104112 if self .parent is not None :
@@ -327,14 +335,15 @@ def add_xpath(self, field_name, xpath, *processors, **kw):
327335 loader.add_xpath('price', '//p[@id="price"]', re='the price is (.*)')
328336
329337 """
330- values = self ._get_xpathvalues (xpath , ** kw )
338+ self .field_tracker [f"{ field_name } _xpath" ] += 1
339+ values = self ._get_xpathvalues (field_name , xpath , ** kw )
331340 self .add_value (field_name , values , * processors , ** kw )
332341
def replace_xpath(self, field_name, xpath, *processors, **kw):
    """
    Similar to :meth:`add_xpath` but replaces collected data instead of adding it.

    The values are extracted through :meth:`_get_xpathvalues` (which also
    receives ``field_name``) and then handed to :meth:`replace_value` together
    with the given ``processors`` and keyword arguments.
    """
    extracted = self._get_xpathvalues(field_name, xpath, **kw)
    self.replace_value(field_name, extracted, *processors, **kw)
339348
340349 def get_xpath (self , xpath , * processors , ** kw ):
@@ -358,13 +367,11 @@ def get_xpath(self, xpath, *processors, **kw):
358367 loader.get_xpath('//p[@id="price"]', TakeFirst(), re='the price is (.*)')
359368
360369 """
361- values = self ._get_xpathvalues (xpath , ** kw )
370+ values = self ._get_xpathvalues (None , xpath , ** kw )
362371 return self .get_value (values , * processors , ** kw )
363372
364- def _get_xpathvalues (self , xpaths , ** kw ):
365- self ._check_selector_method ()
366- xpaths = arg_to_iter (xpaths )
367- return flatten (self .selector .xpath (xpath ).getall () for xpath in xpaths )
373+ def _get_xpathvalues (self , field_name , xpaths , ** kw ):
374+ return self .get_selector_values (field_name , xpaths , self .selector .xpath , ** kw )
368375
369376 def add_css (self , field_name , css , * processors , ** kw ):
370377 """
@@ -384,14 +391,15 @@ def add_css(self, field_name, css, *processors, **kw):
384391 # HTML snippet: <p id="price">the price is $1200</p>
385392 loader.add_css('price', 'p#price', re='the price is (.*)')
386393 """
387- values = self ._get_cssvalues (css , ** kw )
394+ self .field_tracker [f"{ field_name } _css" ] += 1
395+ values = self ._get_cssvalues (field_name , css , ** kw )
388396 self .add_value (field_name , values , * processors , ** kw )
389397
def replace_css(self, field_name, css, *processors, **kw):
    """
    Similar to :meth:`add_css` but replaces collected data instead of adding it.

    The values are extracted through :meth:`_get_cssvalues` (which also
    receives ``field_name``) and then handed to :meth:`replace_value` together
    with the given ``processors`` and keyword arguments.
    """
    extracted = self._get_cssvalues(field_name, css, **kw)
    self.replace_value(field_name, extracted, *processors, **kw)
396404
397405 def get_css (self , css , * processors , ** kw ):
@@ -414,10 +422,68 @@ def get_css(self, css, *processors, **kw):
414422 # HTML snippet: <p id="price">the price is $1200</p>
415423 loader.get_css('p#price', TakeFirst(), re='the price is (.*)')
416424 """
417- values = self ._get_cssvalues (css , ** kw )
425+ values = self ._get_cssvalues (None , css , ** kw )
418426 return self .get_value (values , * processors , ** kw )
419427
420- def _get_cssvalues (self , csss , ** kw ):
428+ def _get_cssvalues (self , field_name , csss , ** kw ):
429+ return self .get_selector_values (field_name , csss , self .selector .css , ** kw )
430+
def get_selector_values(self, field_name, selector_rules, selector, **kw):
    """Run every rule through ``selector`` and return the extracted values.

    Shared implementation behind ``_get_xpathvalues()`` and
    ``_get_cssvalues()``, which only differ in the selector method used.

    :param field_name: name of the field being loaded; ``None`` when the
        caller is not loading a specific field.
    :param selector_rules: a single rule or an iterable of rules.
    :param selector: callable applied to each rule; its result must provide
        ``getall()``. Its ``__name__`` is used as the selector type in the
        stats (the visible callers pass the selector's ``xpath``/``css``).
    :param kw: only the optional ``name`` entry is read here; it is passed
        on to :meth:`write_to_stats`.
    :returns: the per-rule ``getall()`` results, passed through ``flatten``.
    """
    self._check_selector_method()

    selector_type = selector.__name__  # either 'css' or 'xpath'

    # The optional arg in methods like `add_css()` for context in stats
    name = kw.get("name")

    # `add_css()` and `add_xpath()` bump this counter once per call; it is
    # the position assigned to the first rule of the current call.
    tracker_key = f"{field_name}_{selector_type}"
    index = self.field_tracker[tracker_key]

    values = []
    position = index
    for position, rule in enumerate(arg_to_iter(selector_rules), index):
        parsed_data = selector(rule).getall()
        values.append(parsed_data)
        self.write_to_stats(
            field_name, parsed_data, position, selector_type, name=name
        )

    # A call with several rules consumes several positions. Move the tracker
    # to the last one used so that the next call for this field does not
    # reuse a position (and thereby a stats key) of a different rule.
    # Skipped when there is no field name, since nothing is logged then.
    if field_name:
        self.field_tracker[tracker_key] = position

    return flatten(values)
455+
def write_to_stats(
    self, field_name, parsed_data, position, selector_type, name=None
):
    """Record the usage of a single parser rule in the stats.

    The stat key has the form
    ``parser/<loader_name>/<field_name>/<selector_type>/<position>``,
    followed by ``/<name>`` when ``name`` is given and by ``/missing`` when
    the rule produced no data. The key is incremented via
    ``self.stats.inc_value()``.

    NOTES: It's hard to easily denote which parser rule hasn't produced any
    data for the entire crawl, since ItemLoaders essentially don't know
    when the spider is going to be closed, and they are instantiated many
    times throughout the code.

    Logging every rule that yields no data is clunky, but necessary. With
    this, we can only surmise that it's safe to remove a fallback parser
    rule if all of its entries in the stats are '*/missing'.

    :param field_name: field the rule belongs to; nothing is recorded when
        it is empty or ``None``.
    :param parsed_data: data extracted by the rule; ``None`` or ``[]`` is
        treated as missing.
    :param position: position of the rule, used in the stat key.
    :param selector_type: selector type label, used in the stat key.
    :param name: optional label appended to the stat key.
    """
    # Compare against None instead of relying on truthiness: a collector
    # object that happens to be falsy (e.g. it defines ``__len__`` and is
    # still empty) must not silently disable the logging.
    if self.stats is None or not field_name:
        return

    parser_label = (
        f"parser/{self.loader_name}/{field_name}/{selector_type}/{position}"
    )

    if name:
        parser_label += f"/{name}"

    if parsed_data in (None, []):
        parser_label += "/missing"

    self.stats.inc_value(parser_label)
486+
@property
def loader_name(self):
    """Name of the loader's class, as used in the stat keys it records."""
    loader_cls = self.__class__
    return loader_cls.__name__
0 commit comments