From c0d50e0deefad4b44a647789c1e34c8c65e55252 Mon Sep 17 00:00:00 2001 From: "google-labs-jules[bot]" <161369871+google-labs-jules[bot]@users.noreply.github.com> Date: Fri, 10 Oct 2025 14:37:00 +0000 Subject: [PATCH] feat: Add metrics to retry_stuck_testcases cronjob This change introduces comprehensive metrics to the `retry_stuck_testcases` cronjob, allowing for better monitoring and analysis of its performance. The new metrics track the number of testcase candidates, as well as the various outcomes of the process, such as successful restarts and different reasons for skipping. The metrics are designed to be collected even during `--dry-run` executions, providing valuable insights without making any actual changes. This will help in identifying and diagnosing issues with the testcase analysis pipeline more effectively. --- .../_internal/cron/retry_stuck_tasks.py | 28 ++++++++++++++++--- .../_internal/metrics/monitoring_metrics.py | 12 ++++++++ 2 files changed, 36 insertions(+), 4 deletions(-) diff --git a/src/clusterfuzz/_internal/cron/retry_stuck_tasks.py b/src/clusterfuzz/_internal/cron/retry_stuck_tasks.py index 240e86d8ef7..9e3ad366498 100644 --- a/src/clusterfuzz/_internal/cron/retry_stuck_tasks.py +++ b/src/clusterfuzz/_internal/cron/retry_stuck_tasks.py @@ -36,6 +36,7 @@ from clusterfuzz._internal.datastore import data_types from clusterfuzz._internal.datastore import ndb_utils from clusterfuzz._internal.metrics import logs +from clusterfuzz._internal.metrics import monitoring_metrics RETRY_ATTEMPT_COUNT_KEY = 'retry_stuck_task_attempt_count' RETRY_LAST_ATTEMPT_TIME_KEY = 'retry_stuck_task_last_attempt_time' @@ -389,14 +390,30 @@ def _log_verbose_summary(total_from_query: int, results: CategorizedTestcases): an error for each testcase that has been given up on, escalating them for manual review and increasing data integrity visibility. """ + skipped_complete_count = len(results.skipped_as_complete) + monitoring_metrics.STUCK_TESTCASE_COUNT.increment_by( + skipped_complete_count, {'result': 'skipped_as_complete'}) + + skipped_invalid_job_count = len(results.skipped_for_invalid_job) + monitoring_metrics.STUCK_TESTCASE_COUNT.increment_by( + skipped_invalid_job_count, {'result': 'skipped_for_invalid_job'}) + + skipped_cooldown_count = len(results.skipped_for_cooldown) + monitoring_metrics.STUCK_TESTCASE_COUNT.increment_by( + skipped_cooldown_count, {'result': 'skipped_for_cooldown'}) + + skipped_max_attempts_count = len(results.skipped_max_attempts) + monitoring_metrics.STUCK_TESTCASE_COUNT.increment_by( + skipped_max_attempts_count, {'result': 'skipped_max_attempts'}) + logs.info(f""" Analysis phase complete. Total candidates from query: {total_from_query} ------------------------------------------------------------------- - Skipped (already complete): {len(results.skipped_as_complete)} - Skipped (job no longer exists): {len(results.skipped_for_invalid_job)} - Skipped (in cooldown period): {len(results.skipped_for_cooldown)} - Skipped (max attempts reached): {len(results.skipped_max_attempts)} + Skipped (already complete): {skipped_complete_count} + Skipped (job no longer exists): {skipped_invalid_job_count} + Skipped (in cooldown period): {skipped_cooldown_count} + Skipped (max attempts reached): {skipped_max_attempts_count} ------------------------------------------------------------------- Testcases to be restarted: {len(results.to_restart)} """) @@ -428,6 +445,7 @@ def main(args: list[str] | None = None): query = _get_stuck_testcase_candidates_query(config.stuck_deadline) all_candidates = list(query) + monitoring_metrics.STUCK_TESTCASE_CANDIDATE_COUNT.set(len(all_candidates)) categorized_results = filter_and_categorize_candidates(all_candidates, config) @@ -435,6 +453,8 @@ def main(args: list[str] | None = None): restarted_count = restart_analysis_for_testcases( categorized_results.to_restart, config) + monitoring_metrics.STUCK_TESTCASE_COUNT.increment_by( + restarted_count, {'result': 'restarted'}) logs.info(f"Stuck testcase recovery cron finished. {len(all_candidates)} " f"candidates analyzed, {restarted_count} " diff --git a/src/clusterfuzz/_internal/metrics/monitoring_metrics.py b/src/clusterfuzz/_internal/metrics/monitoring_metrics.py index 10c2f675952..c95a2e55890 100644 --- a/src/clusterfuzz/_internal/metrics/monitoring_metrics.py +++ b/src/clusterfuzz/_internal/metrics/monitoring_metrics.py @@ -440,3 +440,15 @@ monitor.BooleanField('deploy_terraform'), monitor.StringField('clusterfuzz_version') ]) + +STUCK_TESTCASE_COUNT = monitor.CounterMetric( + 'stuck_testcase_count', + description='The number of stuck testcases processed by the cron job.', + field_spec=[ + monitor.StringField('result'), + ]) + +STUCK_TESTCASE_CANDIDATE_COUNT = monitor.GaugeMetric( + 'stuck_testcase_candidate_count', + description='The number of candidates for stuck testcases.', + field_spec=[])