Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions spidermon/contrib/scrapy/monitors/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
SPIDERMON_JOBS_COMPARISON_THRESHOLD,
SPIDERMON_ITEM_COUNT_INCREASE,
SPIDERMON_JOBS_COMPARISON_CLOSE_REASONS,
SPIDERMON_JOBS_COMPARISON_ARGUMENTS,
)
from .suites import (
SpiderCloseMonitorSuite,
Expand Down
43 changes: 43 additions & 0 deletions spidermon/contrib/scrapy/monitors/monitors.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,10 @@
# Names of the Scrapy settings read by the jobs-comparison monitors below.
# Each constant holds its own settings key so call sites avoid string typos.
SPIDERMON_JOBS_COMPARISON_TAGS = "SPIDERMON_JOBS_COMPARISON_TAGS"
SPIDERMON_JOBS_COMPARISON_CLOSE_REASONS = "SPIDERMON_JOBS_COMPARISON_CLOSE_REASONS"
SPIDERMON_JOBS_COMPARISON_THRESHOLD = "SPIDERMON_JOBS_COMPARISON_THRESHOLD"
# Dict of spider arguments a previous job must match to be compared against.
SPIDERMON_JOBS_COMPARISON_ARGUMENTS = "SPIDERMON_JOBS_COMPARISON_ARGUMENTS"
# Boolean switch: the arguments filter above is applied only when this is True.
SPIDERMON_JOBS_COMPARISON_ARGUMENTS_ENABLED = (
    "SPIDERMON_JOBS_COMPARISON_ARGUMENTS_ENABLED"
)
SPIDERMON_ITEM_COUNT_INCREASE = "SPIDERMON_ITEM_COUNT_INCREASE"


Expand Down Expand Up @@ -534,6 +538,13 @@ class ZyteJobsComparisonMonitor(BaseStatMonitor):
``SPIDERMON_JOBS_COMPARISON_CLOSE_REASONS`` setting. The default value is ``()``,
which doesn't filter any job based on close_reason. To only consider successfully finished jobs,
use ``("finished",)`` instead.

You can also filter which jobs to compare based on their job arguments using the
``SPIDERMON_JOBS_COMPARISON_ARGUMENTS`` setting, which filters jobs by their ``spider_args``.
Only jobs whose arguments match all of the desired arguments are considered.
Example: ``{"debug_url": "https://www.google.com"}`` or ``{"is_full_crawl": True}``.
Enable this filter by setting ``SPIDERMON_JOBS_COMPARISON_ARGUMENTS_ENABLED`` to ``True``
in the settings; otherwise the filter is not applied.
"""

stat_name = "item_scraped_count"
Expand Down Expand Up @@ -565,6 +576,10 @@ def _get_jobs(self, states, number_of_jobs):
close_reasons = self.crawler.settings.getlist(
SPIDERMON_JOBS_COMPARISON_CLOSE_REASONS, ()
)
args = self._get_args_to_filter()
args_enabled = self.crawler.settings.getbool(
SPIDERMON_JOBS_COMPARISON_ARGUMENTS_ENABLED, False
)

total_jobs = []
start = 0
Expand All @@ -584,6 +599,10 @@ def _get_jobs(self, states, number_of_jobs):
for job in current_jobs:
if close_reasons and job.get("close_reason") not in close_reasons:
continue

if args_enabled and not self._has_desired_args(job, args):
continue

total_jobs.append(job)

if len(current_jobs) < MAX_API_COUNT or len(total_jobs) >= number_of_jobs:
Expand Down Expand Up @@ -611,6 +630,30 @@ def _get_tags_to_filter(self):
tags_to_filter = set(desired_tags) & set(current_tags)
return list(sorted(tags_to_filter))

def _get_args_to_filter(self):
    """Return the dict of desired spider arguments used to filter past jobs.

    Reads the ``SPIDERMON_JOBS_COMPARISON_ARGUMENTS`` setting. ``getdict``
    already falls back to an empty dict when the setting is missing, so the
    original ``if not desired_args: return {}`` branch was redundant — an
    empty dict was returned either way.  (The old docstring also claimed a
    list was returned; the value is a dict.)
    """
    return self.crawler.settings.getdict(SPIDERMON_JOBS_COMPARISON_ARGUMENTS)

def _has_desired_args(self, job, args):
if not args and not job.get("spider_args"):
return True
elif not args and job.get("spider_args"):
return False

job_args = job["spider_args"].keys()
if not all(a in job_args for a in args):
return False

return args == job["spider_args"]

def get_threshold(self):
number_of_jobs = self.crawler.settings.getint(SPIDERMON_JOBS_COMPARISON)

Expand Down
129 changes: 128 additions & 1 deletion tests/contrib/scrapy/monitors/test_jobs_comparison_monitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
SPIDERMON_JOBS_COMPARISON_STATES,
SPIDERMON_JOBS_COMPARISON_TAGS,
SPIDERMON_JOBS_COMPARISON_THRESHOLD,
SPIDERMON_JOBS_COMPARISON_ARGUMENTS,
SPIDERMON_JOBS_COMPARISON_CLOSE_REASONS,
ZyteJobsComparisonMonitor,
monitors,
Expand Down Expand Up @@ -39,7 +40,28 @@ def mock_suite(mock_jobs, monkeypatch):


def get_paginated_jobs(**kwargs):
    """Build ``kwargs["count"]`` mocked job payloads with empty spider_args."""
    return [{"spider_args": {}} for _ in range(kwargs["count"])]


def get_paginated_jobs_with_one_args(**kwargs):
    """Build ``kwargs["count"]`` finished job payloads carrying one spider arg."""
    template = {"spider_args": {"args1": True}, "close_reason": "finished"}
    return [dict(template, spider_args=dict(template["spider_args"])) for _ in range(kwargs["count"])]


def get_paginated_jobs_arg_finished(**kwargs):
    """Build ``kwargs["count"]`` finished job payloads whose spider arg is ``finished``."""
    return [
        {"spider_args": {"finished": True}, "close_reason": "finished"}
        for _ in range(kwargs["count"])
    ]


def get_paginated_jobs_with_finished_close_reason(**kwargs):
Expand Down Expand Up @@ -160,6 +182,7 @@ def test_jobs_comparison_monitor_get_jobs():
monitor._get_tags_to_filter = Mock(side_effect=lambda: None)
monitor.data = Mock()
monitor.crawler.settings.getlist.return_value = None
monitor.crawler.settings.getbool.return_value = False
mock_client.spider.jobs.list = Mock(side_effect=get_paginated_jobs)

# Return exact number of jobs
Expand All @@ -176,6 +199,7 @@ def test_jobs_comparison_monitor_get_jobs():
monitor._get_tags_to_filter = Mock(side_effect=lambda: None)
monitor.data = Mock()
monitor.crawler.settings.getlist.return_value = None
monitor.crawler.settings.getbool.return_value = False
output = [Mock(), Mock()]
mock_client.spider.jobs.list = Mock(return_value=output)

Expand All @@ -192,6 +216,7 @@ def test_jobs_comparison_monitor_get_jobs():
monitor._get_tags_to_filter = Mock(side_effect=lambda: None)
monitor.data = Mock()
monitor.crawler.settings.getlist.return_value = None
monitor.crawler.settings.getbool.return_value = False
mock_client.spider.jobs.list = Mock(side_effect=get_paginated_jobs)

# Jobs bigger than 1000
Expand All @@ -208,6 +233,7 @@ def test_jobs_comparison_monitor_get_jobs():
monitor._get_tags_to_filter = Mock(side_effect=lambda: None)
monitor.data = Mock()
monitor.crawler.settings.getlist.return_value = ["finished"]
monitor.crawler.settings.getbool.return_value = False
mock_client.spider.jobs.list = Mock(
side_effect=get_paginated_jobs_with_finished_close_reason
)
Expand All @@ -225,6 +251,7 @@ def test_jobs_comparison_monitor_get_jobs():
monitor._get_tags_to_filter = Mock(side_effect=lambda: None)
monitor.data = Mock()
monitor.crawler.settings.getlist.return_value = ["finished"]
monitor.crawler.settings.getbool.return_value = False
mock_client.spider.jobs.list = Mock(
side_effect=get_paginated_jobs_with_cancel_close_reason
)
Expand All @@ -233,6 +260,106 @@ def test_jobs_comparison_monitor_get_jobs():
jobs = monitor._get_jobs(states=None, number_of_jobs=50)
assert len(jobs) == 0

mock_client = Mock()
with patch(
"spidermon.contrib.scrapy.monitors.monitors.Client"
) as mock_client_class:
mock_client_class.return_value = mock_client
monitor = TestZyteJobsComparisonMonitor()
monitor._get_tags_to_filter = Mock(side_effect=lambda: None)
monitor.data = Mock()
monitor.crawler.settings.getdict.return_value = {}
monitor.crawler.settings.getlist.return_value = None
monitor.crawler.settings.getbool.return_value = True
mock_client.spider.jobs.list = Mock(side_effect=get_paginated_jobs)

# Return exact number of jobs
jobs = monitor._get_jobs(states=None, number_of_jobs=50)
assert len(jobs) == 50
mock_client.spider.jobs.list.assert_called_once()

mock_client = Mock()
with patch(
"spidermon.contrib.scrapy.monitors.monitors.Client"
) as mock_client_class:
mock_client_class.return_value = mock_client
monitor = TestZyteJobsComparisonMonitor()
monitor._get_tags_to_filter = Mock(side_effect=lambda: None)
monitor.data = Mock()
monitor.crawler.settings.getdict.return_value = {"finished": True}
monitor.crawler.settings.getlist.return_value = ["finished"]
monitor.crawler.settings.getbool.return_value = True
mock_client.spider.jobs.list = Mock(side_effect=get_paginated_jobs_arg_finished)

# Return exact number of jobs
jobs = monitor._get_jobs(states=None, number_of_jobs=50)
assert len(jobs) == 50
mock_client.spider.jobs.list.assert_called_once()

mock_client = Mock()
with patch(
"spidermon.contrib.scrapy.monitors.monitors.Client"
) as mock_client_class:
mock_client_class.return_value = mock_client
monitor = TestZyteJobsComparisonMonitor()
monitor._get_tags_to_filter = Mock(side_effect=lambda: None)
monitor.data = Mock()
monitor.crawler.settings.getdict.return_value = {"finished": False}
monitor.crawler.settings.getlist.return_value = ["finished"]
monitor.crawler.settings.getbool.return_value = True
mock_client.spider.jobs.list = Mock(side_effect=get_paginated_jobs_arg_finished)

# Return 0 number of jobs as argument values did not matched
jobs = monitor._get_jobs(states=None, number_of_jobs=50)
assert len(jobs) == 0
mock_client.spider.jobs.list.assert_called_once()

mock_client = Mock()
with patch(
"spidermon.contrib.scrapy.monitors.monitors.Client"
) as mock_client_class:
mock_client_class.return_value = mock_client
monitor = TestZyteJobsComparisonMonitor()
monitor._get_tags_to_filter = Mock(side_effect=lambda: None)
monitor.data = Mock()

def mock_getlist(key, default=None):
data = {
SPIDERMON_JOBS_COMPARISON_CLOSE_REASONS: ["finished"],
}
return data.get(key, default)

monitor.crawler.settings = Mock()
monitor.crawler.settings.getlist.side_effect = mock_getlist
monitor.crawler.settings.getdict.return_value = {}
monitor.crawler.settings.getbool.return_value = True
mock_client.spider.jobs.list = Mock(side_effect=get_paginated_jobs_arg_finished)

# Return 0 number of jobs
jobs = monitor._get_jobs(states=None, number_of_jobs=5)
assert len(jobs) == 0
mock_client.spider.jobs.list.assert_called_once()

mock_client = Mock()
with patch(
"spidermon.contrib.scrapy.monitors.monitors.Client"
) as mock_client_class:
mock_client_class.return_value = mock_client
monitor = TestZyteJobsComparisonMonitor()
monitor._get_tags_to_filter = Mock(side_effect=lambda: None)
monitor.data = Mock()

monitor.crawler.settings = Mock()
monitor.crawler.settings.getlist.return_value = ["finished"]
monitor.crawler.settings.getdict.return_value = {"is_debug": False}
monitor.crawler.settings.getbool.return_value = True
mock_client.spider.jobs.list = Mock(side_effect=get_paginated_jobs_arg_finished)

# Return 0 number of jobs
jobs = monitor._get_jobs(states=None, number_of_jobs=5)
assert len(jobs) == 0
mock_client.spider.jobs.list.assert_called_once()


@pytest.mark.parametrize(
["item_count", "previous_counts", "threshold", "should_raise"],
Expand Down