Skip to content

Commit 05ddbc0

Browse files
committed
initial migration of Burnzz/scrapy-loader-upkeep code
1 parent 37a63cd commit 05ddbc0

File tree

1 file changed

+80
-14
lines changed

1 file changed

+80
-14
lines changed

itemloaders/__init__.py

Lines changed: 80 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ class ItemLoader:
8686
default_input_processor = Identity()
8787
default_output_processor = Identity()
8888

89-
def __init__(self, item=None, selector=None, parent=None, **context):
89+
def __init__(self, item=None, selector=None, parent=None, stats=None, **context):
9090
self.selector = selector
9191
context.update(selector=selector)
9292
if item is None:
@@ -99,6 +99,14 @@ def __init__(self, item=None, selector=None, parent=None, **context):
9999
for field_name, value in item.items():
100100
self._values[field_name] += arg_to_iter(value)
101101

102+
# This is the new injected dependency that we'll be using as the main
103+
# functionality of this tool.
104+
self.stats = stats
105+
106+
# This keeps track of the position of the 'field' name that is being
107+
# loaded for a more accurate logging in the stats.
108+
self.field_tracker = defaultdict(int)
109+
102110
@property
103111
def _values(self):
104112
if self.parent is not None:
@@ -327,14 +335,15 @@ def add_xpath(self, field_name, xpath, *processors, **kw):
327335
loader.add_xpath('price', '//p[@id="price"]', re='the price is (.*)')
328336
329337
"""
330-
values = self._get_xpathvalues(xpath, **kw)
338+
self.field_tracker[f"{field_name}_xpath"] += 1
339+
values = self._get_xpathvalues(field_name, xpath, **kw)
331340
self.add_value(field_name, values, *processors, **kw)
332341

333342
def replace_xpath(self, field_name, xpath, *processors, **kw):
    """
    Similar to :meth:`add_xpath` but replaces collected data instead of adding it.

    The per-field XPath counter in ``self.field_tracker`` is advanced before
    extracting, exactly as :meth:`add_xpath` does, so the rule used here is
    reported to the stats under its own position instead of sharing the
    position of an earlier rule for the same field.
    """
    # Give this rule its own slot; the extraction helper reads the counter
    # as the base position for the stats it writes.
    self.field_tracker[f"{field_name}_xpath"] += 1
    values = self._get_xpathvalues(field_name, xpath, **kw)
    self.replace_value(field_name, values, *processors, **kw)
339348

340349
def get_xpath(self, xpath, *processors, **kw):
@@ -358,13 +367,11 @@ def get_xpath(self, xpath, *processors, **kw):
358367
loader.get_xpath('//p[@id="price"]', TakeFirst(), re='the price is (.*)')
359368
360369
"""
361-
values = self._get_xpathvalues(xpath, **kw)
370+
values = self._get_xpathvalues(None, xpath, **kw)
362371
return self.get_value(values, *processors, **kw)
363372

364-
def _get_xpathvalues(self, xpaths, **kw):
365-
self._check_selector_method()
366-
xpaths = arg_to_iter(xpaths)
367-
return flatten(self.selector.xpath(xpath).getall() for xpath in xpaths)
373+
def _get_xpathvalues(self, field_name, xpaths, **kw):
    """Extract the values matched by one or more XPath rules.

    :param field_name: field the values are loaded for; forwarded to
        :meth:`get_selector_values` for stats labelling (may be ``None``).
    :param xpaths: a single XPath expression or an iterable of them.
    :param kw: forwarded unchanged to :meth:`get_selector_values`.
    :return: whatever :meth:`get_selector_values` returns.
    """
    # ``self.selector.xpath`` below is evaluated while the argument list is
    # built, i.e. before get_selector_values() can run its own check. Validate
    # here first so a loader without a selector fails with the error raised by
    # _check_selector_method() rather than an AttributeError on ``None``.
    self._check_selector_method()
    return self.get_selector_values(field_name, xpaths, self.selector.xpath, **kw)
368375

369376
def add_css(self, field_name, css, *processors, **kw):
370377
"""
@@ -384,14 +391,15 @@ def add_css(self, field_name, css, *processors, **kw):
384391
# HTML snippet: <p id="price">the price is $1200</p>
385392
loader.add_css('price', 'p#price', re='the price is (.*)')
386393
"""
387-
values = self._get_cssvalues(css, **kw)
394+
self.field_tracker[f"{field_name}_css"] += 1
395+
values = self._get_cssvalues(field_name, css, **kw)
388396
self.add_value(field_name, values, *processors, **kw)
389397

390398
def replace_css(self, field_name, css, *processors, **kw):
    """
    Similar to :meth:`add_css` but replaces collected data instead of adding it.

    The per-field CSS counter in ``self.field_tracker`` is advanced before
    extracting, exactly as :meth:`add_css` does, so the rule used here is
    reported to the stats under its own position instead of sharing the
    position of an earlier rule for the same field.
    """
    # Give this rule its own slot; the extraction helper reads the counter
    # as the base position for the stats it writes.
    self.field_tracker[f"{field_name}_css"] += 1
    values = self._get_cssvalues(field_name, css, **kw)
    self.replace_value(field_name, values, *processors, **kw)
396404

397405
def get_css(self, css, *processors, **kw):
@@ -414,10 +422,68 @@ def get_css(self, css, *processors, **kw):
414422
# HTML snippet: <p id="price">the price is $1200</p>
415423
loader.get_css('p#price', TakeFirst(), re='the price is (.*)')
416424
"""
417-
values = self._get_cssvalues(css, **kw)
425+
values = self._get_cssvalues(None, css, **kw)
418426
return self.get_value(values, *processors, **kw)
419427

420-
def _get_cssvalues(self, csss, **kw):
428+
def _get_cssvalues(self, field_name, csss, **kw):
    """Extract the values matched by one or more CSS rules.

    :param field_name: field the values are loaded for; forwarded to
        :meth:`get_selector_values` for stats labelling (may be ``None``).
    :param csss: a single CSS selector or an iterable of them.
    :param kw: forwarded unchanged to :meth:`get_selector_values`.
    :return: whatever :meth:`get_selector_values` returns.
    """
    # ``self.selector.css`` below is evaluated while the argument list is
    # built, i.e. before get_selector_values() can run its own check. Validate
    # here first so a loader without a selector fails with the error raised by
    # _check_selector_method() rather than an AttributeError on ``None``.
    self._check_selector_method()
    return self.get_selector_values(field_name, csss, self.selector.css, **kw)
430+
431+
def get_selector_values(self, field_name, selector_rules, selector, **kw):
    """Run every rule through ``selector`` and collect what it matches.

    Shared implementation behind _get_xpathvalues() and _get_cssvalues(),
    which differ only in the selector method they pass in.

    :param field_name: field the values are loaded for. When it is falsy
        (e.g. ``None``) no position bookkeeping is done, since
        write_to_stats() ignores such calls anyway.
    :param selector_rules: a single rule or an iterable of rules.
    :param selector: callable invoked with each rule; the result's
        ``getall()`` is collected and ``selector.__name__`` labels the stats
        (either 'css' or 'xpath').
    :param kw: only the optional ``name`` entry is read here; it is passed
        to write_to_stats() as extra context.
    :return: the per-rule ``getall()`` results passed through ``flatten``.
    """
    self._check_selector_method()

    selector_type = selector.__name__  # either 'css' or 'xpath'

    # The optional arg in methods like `add_css()` for context in stats
    name = kw.get("name")

    # `add_css()` / `add_xpath()` bump this counter once per call; it is the
    # base position for the stats written below. Skip the lookup when there
    # is no field so the defaultdict does not grow junk keys like
    # "None_xpath".
    tracked = bool(field_name)
    tracker_key = f"{field_name}_{selector_type}"
    index = self.field_tracker[tracker_key] if tracked else 0

    values = []
    last_position = index
    for position, rule in enumerate(arg_to_iter(selector_rules), index):
        parsed_data = selector(rule).getall()
        values.append(parsed_data)
        self.write_to_stats(
            field_name, parsed_data, position, selector_type, name=name
        )
        last_position = position

    # A call with several rules consumes several positions. Remember the
    # last one used so the next call for this field starts after it instead
    # of reusing a position and merging two rules into one stats counter.
    if tracked:
        self.field_tracker[tracker_key] = last_position

    return flatten(values)
455+
456+
def write_to_stats(
    self, field_name, parsed_data, position, selector_type, name=None
):
    """Responsible for logging the parser rules usage.

    Increments one stats key per call, shaped as
    ``parser/<loader_name>/<field_name>/<selector_type>/<position>``, with
    ``/<name>`` appended when ``name`` is given and ``/missing`` appended
    when ``parsed_data`` is ``None`` or an empty list.

    Nothing is recorded when no stats object was injected (``self.stats`` is
    ``None``) or when ``field_name`` is falsy.

    :param field_name: field the rule was loading data for.
    :param parsed_data: what the rule extracted.
    :param position: position of the rule among those used for the field.
    :param selector_type: label of the selector kind, e.g. 'css' or 'xpath'.
    :param name: optional extra label appended to the stats key.

    NOTES: It's hard to easily denote which parser rule hasn't produced any
    data for the entire crawl, since ItemLoaders essentially don't know
    when the spider is going to be closed, as well as it has many
    instantiations all throughout the code.

    The implementation below where each missing parsed_data is being logged
    to the stat is clunky, but necessary. With this, we can only surmise
    that it's safe to remove a fallback parser rule if it's all just
    '*/missing' in the stats.
    """
    # Compare against None explicitly: a stats object may legitimately be
    # falsy (for instance a container that is still empty) and must still
    # receive the counters.
    if self.stats is None or not field_name:
        return

    parser_label = (
        f"parser/{self.loader_name}/{field_name}/{selector_type}/{position}"
    )

    if name:
        parser_label += f"/{name}"

    if parsed_data in (None, []):
        parser_label += "/missing"

    self.stats.inc_value(parser_label)
486+
487+
@property
def loader_name(self):
    """Name of the concrete class of this loader instance."""
    loader_cls = type(self)
    return loader_cls.__name__

0 commit comments

Comments
 (0)