Skip to content

Commit a49bdba

Browse files
authored
support infra issues and tasks with no groups in log parser intermittent detector. (#8881)
1 parent 865def0 commit a49bdba

File tree

2 files changed

+66
-9
lines changed

2 files changed

+66
-9
lines changed

tests/log_parser/test_store_failure_lines.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -310,9 +310,7 @@ def verify_classification_id(jobs, job1_fcid, job2_fcid):
310310

311311

312312
"""
313-
TODO: write tests for testing intermittents.py handling in the parser.
314-
* not supported yet: test infra/tooling error + 1x green - both green
315-
* test multiple push ids
313+
TODO: test multiple push ids
316314
"""
317315

318316

@@ -328,7 +326,7 @@ def test_infra_no_intermittent(activate_responses, hundred_job_blobs, mock_parse
328326

329327
# this will parse and check for intermittents
330328
mock_full_log_parser(job_logs, mock_parser)
331-
verify_classification_id(jobs, 1, 1)
329+
verify_classification_id(jobs, 1, 8)
332330

333331

334332
def test_infra_intermittent(activate_responses, hundred_job_blobs, mock_parser, create_jobs):

treeherder/log_parser/intermittents.py

Lines changed: 64 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,61 @@
11
import datetime
22

3-
from treeherder.model.models import Group, GroupStatus, Job, Push
3+
from treeherder.model.models import Group, GroupStatus, Job, JobLog, Push
4+
5+
6+
def _check_and_mark_infra(current_job, job_ids, push_ids):
    """
    Mark failures of the same job type that produced no groups as intermittent.

    Collects every finished job log row for jobs that share the repository and
    job type name of ``current_job`` inside the push range.  If no more than
    half of those rows are failures that the caller has not already seen, the
    failing jobs get ``failure_classification_id`` 8
    (`intermittent_needs_classification`).

    current_job - Job object of incoming job we are parsing
    job_ids - list of all job_ids found in previous query
    push_ids - ids of pushes we care about from previous query; the range
               lookup uses push_ids[-1] as the lower and push_ids[0] as the
               upper bound, so the list is expected to be ordered newest first

    Always returns None; the only side effect is the update of Job rows.
    """
    if current_job.result != "success":
        # if new job is broken, then only look on same push
        # otherwise it could be a new failure.
        push_ids = [current_job.push.id]

    # nothing to compare against; also avoids an IndexError on push_ids[-1] below
    if not push_ids:
        return

    # look for all jobs in push_ids matching current_job.job_type.name
    # if older are failing for "infra", then ensure same job is passing
    # if so mark as intermittent
    # list() evaluates the query exactly once; the rows are reused below
    extra_jobs = list(
        JobLog.objects.filter(
            job__push__id__range=(push_ids[-1], push_ids[0]),
            job__push__repository__id=current_job.repository.id,
            job__job_type__name=current_job.job_type.name,
            job__failure_classification_id__in=[1, 6],
            status__in=(1, 2, 3),  # ignore pending
            job__result__in=[
                "busted",
                "testfailed",
                "exception",
                "success",
            ],  # primarily ignore retry/usercancel
        ).values(
            "job__id",
            "job__result",
            "job__failure_classification_id",
        )
    )

    if not extra_jobs:
        return

    # ensure 50% 'success' rate
    # a row counts as a success when the job either passed or is already known
    # to the caller via job_ids (i.e. it ran and produced groups)
    # jobs without groups (like marionette) will still get tallied properly here
    known_job_ids = set(job_ids)
    extra_failed = [
        row
        for row in extra_jobs
        if row["job__id"] not in known_job_ids and row["job__result"] != "success"
    ]

    # look for failure rate > 50% and exit early
    if len(extra_failed) / len(extra_jobs) > 0.5:
        return

    # any extra_failed rows are failures without groups (infra/timeout/etc.)
    # theoretically there could be many jobs here
    # mark them as `intermittent_needs_classification`; a set collapses jobs
    # that appear once per JobLog row so each job is updated a single time
    ids_to_mark = {
        row["job__id"]
        for row in extra_failed
        if row["job__failure_classification_id"] not in (4, 8)
    }
    if ids_to_mark:
        Job.objects.filter(id__in=sorted(ids_to_mark)).update(failure_classification_id=8)
459

560

661
def check_and_mark_intermittent(job_id):
@@ -52,7 +107,7 @@ def check_and_mark_intermittent(job_id):
52107
job_logs__job__result__in=[
53108
"success",
54109
"testfailed",
55-
], # primarily ignore retry/usercancel
110+
], # primarily ignore retry/usercancel/unknown
56111
group_result__status__in=[GroupStatus.OK, GroupStatus.ERROR],
57112
)
58113
.values(
@@ -65,6 +120,11 @@ def check_and_mark_intermittent(job_id):
65120
.order_by("-job_logs__job__push__id")
66121
)
67122

123+
# If no groups, look for infra
124+
distinct_job_ids = list(set([f["job_logs__job__id"] for f in all_groups]))
125+
if len(distinct_job_ids) == 1:
126+
return _check_and_mark_infra(current_job, distinct_job_ids, ids)
127+
68128
mappings = {}
69129
for item in all_groups:
70130
jobname = item["job_logs__job__job_type__name"].strip("-cf")
@@ -146,10 +206,9 @@ def check_and_mark_intermittent(job_id):
146206
):
147207
target_job = Job.objects.filter(id=job)
148208

149-
# TODO: infra would be nice to detect, but in the case of no groups, our data set == []
150-
# edge case is all groups originally pass and then shutdown leaks cause 'testfailed'.
151-
# also we ignore infra/leaks that don't report group failures in errorsummary files
152209
if target_job[0].result != "success" and target_job[
153210
0
154211
].failure_classification_id not in [4, 8]:
155212
target_job.update(failure_classification_id=8)
213+
214+
return _check_and_mark_infra(current_job, distinct_job_ids, ids)

0 commit comments

Comments
 (0)