import os
from dataclasses import dataclass
import sys
+import logging

import github
from github import Github
@@ -24,6 +25,7 @@ class JobMetrics:
    status: int
    created_at_ns: int
    workflow_id: int
+    workflow_name: str


@dataclass
@@ -43,40 +45,60 @@ def get_sampled_workflow_metrics(github_repo: github.Repository):
    Returns a list of GaugeMetric objects, containing the relevant metrics about
    the workflow
    """
+    queued_job_counts = {}
+    running_job_counts = {}

    # Other states are available (pending, waiting, etc), but the meaning
    # is not documented (See #70540).
    # "queued" seems to be the info we want.
-    queued_workflow_count = len(
-        [
-            x
-            for x in github_repo.get_workflow_runs(status="queued")
-            if x.name in WORKFLOWS_TO_TRACK
-        ]
-    )
-    running_workflow_count = len(
-        [
-            x
-            for x in github_repo.get_workflow_runs(status="in_progress")
-            if x.name in WORKFLOWS_TO_TRACK
-        ]
-    )
+    for queued_workflow in github_repo.get_workflow_runs(status="queued"):
+        if queued_workflow.name not in WORKFLOWS_TO_TRACK:
+            continue
+        for queued_workflow_job in queued_workflow.jobs():
+            job_name = queued_workflow_job.name
+            # Workflows marked as queued can potentially only have some jobs
+            # queued, so make sure to also count jobs currently in progress.
+            if queued_workflow_job.status == "queued":
+                if job_name not in queued_job_counts:
+                    queued_job_counts[job_name] = 1
+                else:
+                    queued_job_counts[job_name] += 1
+            elif queued_workflow_job.status == "in_progress":
+                if job_name not in running_job_counts:
+                    running_job_counts[job_name] = 1
+                else:
+                    running_job_counts[job_name] += 1
+
+    for running_workflow in github_repo.get_workflow_runs(status="in_progress"):
+        if running_workflow.name not in WORKFLOWS_TO_TRACK:
+            continue
+        for running_workflow_job in running_workflow.jobs():
+            job_name = running_workflow_job.name
+            if running_workflow_job.status != "in_progress":
+                continue
+
+            if job_name not in running_job_counts:
+                running_job_counts[job_name] = 1
+            else:
+                running_job_counts[job_name] += 1

    workflow_metrics = []
-    workflow_metrics.append(
-        GaugeMetric(
-            "workflow_queue_size",
-            queued_workflow_count,
-            time.time_ns(),
+    for queued_job in queued_job_counts:
+        workflow_metrics.append(
+            GaugeMetric(
+                f"workflow_queue_size_{queued_job}",
+                queued_job_counts[queued_job],
+                time.time_ns(),
+            )
        )
-    )
-    workflow_metrics.append(
-        GaugeMetric(
-            "running_workflow_count",
-            running_workflow_count,
-            time.time_ns(),
+    for running_job in running_job_counts:
+        workflow_metrics.append(
+            GaugeMetric(
+                f"running_workflow_count_{running_job}",
+                running_job_counts[running_job],
+                time.time_ns(),
+            )
        )
-    )
    # Always send a heartbeat metric so we can monitor whether this container is still able to log to Grafana.
    workflow_metrics.append(
        GaugeMetric("metrics_container_heartbeat", 1, time.time_ns())
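For context on the hunk above: instead of two repository-wide counters, the sampler now tallies queued and running jobs per job name across the tracked workflow runs. Below is a condensed, self-contained sketch of that tallying logic, using collections.Counter and plain dicts in place of the PyGithub run/job objects; the workflow and job names in the sample data are invented for illustration.

from collections import Counter

WORKFLOWS_TO_TRACK = ["LLVM Premerge Checks"]  # hypothetical tracked workflow

# Hypothetical stand-ins for github_repo.get_workflow_runs(status=...): each
# run carries its workflow name and a list of jobs with their own status.
queued_runs = [
    {
        "name": "LLVM Premerge Checks",
        "jobs": [
            {"name": "premerge-linux", "status": "queued"},
            {"name": "premerge-windows", "status": "in_progress"},
        ],
    },
]
in_progress_runs = [
    {
        "name": "LLVM Premerge Checks",
        "jobs": [{"name": "premerge-linux", "status": "in_progress"}],
    },
]

queued_job_counts = Counter()
running_job_counts = Counter()

# A run reported as "queued" can still have some of its jobs already running,
# so each job is bucketed by its own status rather than the run's status.
for run in queued_runs:
    if run["name"] not in WORKFLOWS_TO_TRACK:
        continue
    for job in run["jobs"]:
        if job["status"] == "queued":
            queued_job_counts[job["name"]] += 1
        elif job["status"] == "in_progress":
            running_job_counts[job["name"]] += 1

for run in in_progress_runs:
    if run["name"] not in WORKFLOWS_TO_TRACK:
        continue
    for job in run["jobs"]:
        if job["status"] == "in_progress":
            running_job_counts[job["name"]] += 1

print(dict(queued_job_counts))   # {'premerge-linux': 1}
print(dict(running_job_counts))  # {'premerge-windows': 1, 'premerge-linux': 1}

Emitting one gauge per job name (workflow_queue_size_<job>, running_workflow_count_<job>) gives Grafana a separate series per job instead of a single aggregate for the whole repository.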
@@ -157,7 +179,7 @@ def get_per_workflow_metrics(
            # longer in a testing state and we can directly assert the workflow
            # result.
            for step in workflow_job.steps:
-                if step.conclusion != "success":
+                if step.conclusion != "success" and step.conclusion != "skipped":
                    job_result = 0
                    break

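The widened condition in this hunk keeps a "skipped" step from marking an otherwise green job as failed. A tiny self-contained illustration, with SimpleNamespace objects standing in for the PyGithub step objects:

from types import SimpleNamespace

steps = [
    SimpleNamespace(conclusion="success"),
    SimpleNamespace(conclusion="skipped"),  # e.g. a conditional step that never ran
]

job_result = 1
for step in steps:
    # Only conclusions other than "success" and "skipped" count as failures;
    # with the old check the skipped step would have forced job_result to 0.
    if step.conclusion != "success" and step.conclusion != "skipped":
        job_result = 0
        break

print(job_result)  # 1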
@@ -171,6 +193,10 @@ def get_per_workflow_metrics(
            # in nanoseconds.
            created_at_ns = int(created_at.timestamp()) * 10**9

+            logging.info(
+                f"Adding a job metric for job {workflow_job.id} in workflow {workflow_run.id}"
+            )
+
            workflow_metrics.append(
                JobMetrics(
                    workflow_run.name + "-" + workflow_job.name,
@@ -179,6 +205,7 @@ def get_per_workflow_metrics(
                    job_result,
                    created_at_ns,
                    workflow_run.id,
+                    workflow_run.name,
                )
            )

@@ -198,7 +225,7 @@ def upload_metrics(workflow_metrics, metrics_userid, api_key):
    """

    if len(workflow_metrics) == 0:
-        print("No metrics found to upload.", file=sys.stderr)
+        logging.info("No metrics found to upload.")
        return

    metrics_batch = []
@@ -227,16 +254,12 @@ def upload_metrics(workflow_metrics, metrics_userid, api_key):
    )

    if response.status_code < 200 or response.status_code >= 300:
-        print(
-            f"Failed to submit data to Grafana: {response.status_code}", file=sys.stderr
-        )
+        logging.info(f"Failed to submit data to Grafana: {response.status_code}")


def main():
    # Authenticate with Github
    auth = Auth.Token(os.environ["GITHUB_TOKEN"])
-    github_object = Github(auth=auth)
-    github_repo = github_object.get_repo("llvm/llvm-project")

    grafana_api_key = os.environ["GRAFANA_API_KEY"]
    grafana_metrics_userid = os.environ["GRAFANA_METRICS_USERID"]
@@ -248,24 +271,24 @@ def main():
    # Enter the main loop. Every five minutes we wake up and dump metrics for
    # the relevant jobs.
    while True:
+        github_object = Github(auth=auth)
+        github_repo = github_object.get_repo("llvm/llvm-project")
+
        current_metrics = get_per_workflow_metrics(github_repo, workflows_to_track)
        current_metrics += get_sampled_workflow_metrics(github_repo)
-        # Always send a hearbeat metric so we can monitor is this container is still able to log to Grafana.
-        current_metrics.append(
-            GaugeMetric("metrics_container_heartbeat", 1, time.time_ns())
-        )

        upload_metrics(current_metrics, grafana_metrics_userid, grafana_api_key)
-        print(f"Uploaded {len(current_metrics)} metrics", file=sys.stderr)
+        logging.info(f"Uploaded {len(current_metrics)} metrics")

        for workflow_metric in reversed(current_metrics):
            if isinstance(workflow_metric, JobMetrics):
                workflows_to_track[
-                    workflow_metric.job_name
+                    workflow_metric.workflow_name
                ] = workflow_metric.workflow_id

        time.sleep(SCRAPE_INTERVAL_SECONDS)


if __name__ == "__main__":
+    logging.basicConfig(level=logging.INFO)
    main()
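A note on the logging switch: logging.basicConfig(level=logging.INFO) must run before the first logging.info call, since the root logger drops INFO records by default (only WARNING and above pass), and the implicit handler set up by an unconfigured logging.info call keeps that default level. A minimal sketch:

import logging

logging.basicConfig(level=logging.INFO)
# With the root logger set to INFO, this is written to stderr as
# "INFO:root:Uploaded 42 metrics"; without basicConfig it would be suppressed.
logging.info("Uploaded 42 metrics")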