1
- """ StatesAccountingAgent sends periodically numbers of jobs in various states for various
1
+ """ StatesAccountingAgent sends periodically numbers of jobs and pilots in various states for various
2
2
sites to the Monitoring system to create historical plots.
3
3
4
4
.. literalinclude:: ../ConfigTemplate.cfg
8
8
:caption: StatesAccountingAgent options
9
9
"""
10
10
from DIRAC import S_OK , S_ERROR
11
+ from DIRAC .ConfigurationSystem .Client .Helpers .Operations import Operations
11
12
from DIRAC .Core .Base .AgentModule import AgentModule
12
13
from DIRAC .Core .Utilities import Time
13
14
from DIRAC .AccountingSystem .Client .Types .WMSHistory import WMSHistory
14
15
from DIRAC .AccountingSystem .Client .DataStoreClient import DataStoreClient
15
16
from DIRAC .MonitoringSystem .Client .MonitoringReporter import MonitoringReporter
16
17
from DIRAC .WorkloadManagementSystem .DB .JobDB import JobDB
18
+ from DIRAC .WorkloadManagementSystem .DB .PilotAgentsDB import PilotAgentsDB
17
19
18
20
19
21
class StatesAccountingAgent (AgentModule ):
20
22
"""Agent that every 15 minutes will report
21
23
to the AccountingDB (MySQL) or the Monitoring DB (ElasticSearch), or both,
22
24
a snapshot of the JobDB.
25
+ Also sends a snapshot of PilotAgentsDB to Monitoring.
23
26
"""
24
27
28
+ # WMSHistory fields
25
29
__summaryKeyFieldsMapping = [
26
30
"Status" ,
27
31
"Site" ,
@@ -36,24 +40,29 @@ class StatesAccountingAgent(AgentModule):
36
40
__summaryValueFieldsMapping = ["Jobs" , "Reschedules" ]
37
41
__renameFieldsMapping = {"JobType" : "JobSplitType" }
38
42
43
+ # PilotsHistory fields
44
+ __pilotKeyFields = ["TaskQueueID" , "GridSite" , "GridType" , "Status" ]
45
+ __pilotValueFields = ["NumOfPilots" ]
46
+
39
47
def initialize (self ):
40
48
"""Standard initialization"""
41
49
# This agent will always loop every 15 minutes
42
50
self .am_setOption ("PollingTime" , 900 )
43
51
44
52
self .backends = self .am_getOption ("Backends" , "Accounting" ).replace (" " , "" ).split ("," )
45
- messageQueue = self .am_getOption ("MessageQueue" , "dirac.wmshistory" )
46
-
47
- self .log .info ("Committing to %s backend" % "and " .join (self .backends ))
53
+ self .monitoringEnabled = Operations ().getValue ("MonitoringEnabled" , False )
48
54
55
+ messageQueue = self .am_getOption ("MessageQueue" , "dirac.wmshistory" )
56
+ pilotMessageQueue = self .am_getOption ("MessageQueue" , "dirac.monitoring" )
49
57
self .datastores = {} # For storing the clients to Accounting and Monitoring
50
58
51
59
if "Accounting" in self .backends :
52
60
self .datastores ["Accounting" ] = DataStoreClient (retryGraceTime = 900 )
53
- if "Monitoring" in self .backends :
61
+ if "Monitoring" in self .backends or self . monitoringEnabled :
54
62
self .datastores ["Monitoring" ] = MonitoringReporter (
55
63
monitoringType = "WMSHistory" , failoverQueueName = messageQueue
56
64
)
65
+ self .pilotReporter = MonitoringReporter (monitoringType = "PilotsHistory" , failoverQueueName = pilotMessageQueue )
57
66
58
67
self .__jobDBFields = []
59
68
for field in self .__summaryKeyFieldsMapping :
@@ -66,27 +75,62 @@ def initialize(self):
66
75
67
76
def execute (self ):
68
77
"""Main execution method"""
69
- # Get the WMS Snapshot!
78
+
79
+ # PilotsHistory to Monitoring
80
+ if self .monitoringEnabled :
81
+ self .log .info ("Committing PilotsHistory to Monitoring" )
82
+ result = PilotAgentsDB .getSummarySnapshot (self .__pilotKeyFields )
83
+ now = Time .dateTime ()
84
+ if not result ["OK" ]:
85
+ self .log .error (
86
+ "Can't get the PilotAgentsDB summary" ,
87
+ "%s: won't commit PilotsHistory at this cycle" % result ["Message" ],
88
+ )
89
+ return S_ERROR ()
90
+
91
+ values = result ["Value" ][1 ]
92
+ for record in values :
93
+ record = record [1 :]
94
+ rD = {}
95
+ for iP , _ in enumerate (self .__pilotKeyFields ):
96
+ rD [self .__pilotKeyFields [iP ]] = record [iP ]
97
+ record = record [len (self .__pilotKeyFields ) :]
98
+ for iP , _ in enumerate (self .__pilotValueFields ):
99
+ rD [self .__pilotValueFields [iP ]] = int (record [iP ])
100
+ rD ["timestamp" ] = int (Time .toEpoch (now ))
101
+ self .log .debug ("Adding following PilotsHistory record to Reporter: \n " , rD )
102
+ self .pilotReporter .addRecord (rD )
103
+
104
+ self .log .info ("Committing to Monitoring..." )
105
+ result = self .pilotReporter .commit ()
106
+ if not result ["OK" ]:
107
+ self .log .error ("Could not commit to Monitoring" , result ["Message" ])
108
+ return result
109
+ self .log .verbose ("Done committing PilotsHistory to Monitoring" )
110
+
111
+ # WMSHistory to Monitoring or Accounting
112
+ self .log .info ("Committing WMSHistory to %s backend" % "and " .join (self .backends ))
70
113
result = JobDB ().getSummarySnapshot (self .__jobDBFields )
71
114
now = Time .dateTime ()
72
115
if not result ["OK" ]:
73
- self .log .error ("Can't get the JobDB summary" , "%s: won't commit at this cycle" % result ["Message" ])
116
+ self .log .error (
117
+ "Can't get the JobDB summary" , "%s: won't commit WMSHistory at this cycle" % result ["Message" ]
118
+ )
74
119
return S_ERROR ()
75
120
76
- # Now we try to commit
77
121
values = result ["Value" ][1 ]
78
122
79
- self .log .info ("Start sending records" )
123
+ self .log .info ("Start sending WMSHistory records" )
80
124
for record in values :
81
125
record = record [1 :]
82
126
rD = {}
83
127
for fV in self .__summaryDefinedFields :
84
128
rD [fV [0 ]] = fV [1 ]
85
- for iP in range ( len ( self .__summaryKeyFieldsMapping ) ):
129
+ for iP , _ in enumerate ( self .__summaryKeyFieldsMapping ):
86
130
fieldName = self .__summaryKeyFieldsMapping [iP ]
87
131
rD [self .__renameFieldsMapping .get (fieldName , fieldName )] = record [iP ]
88
132
record = record [len (self .__summaryKeyFieldsMapping ) :]
89
- for iP in range ( len ( self .__summaryValueFieldsMapping ) ):
133
+ for iP , _ in enumerate ( self .__summaryValueFieldsMapping ):
90
134
rD [self .__summaryValueFieldsMapping [iP ]] = int (record [iP ])
91
135
92
136
for backend in self .datastores :
@@ -101,16 +145,16 @@ def execute(self):
101
145
acWMS .setValuesFromDict (rD )
102
146
retVal = acWMS .checkValues ()
103
147
if not retVal ["OK" ]:
104
- self .log .error ("Invalid accounting record " , "%s -> %s" % (retVal ["Message" ], rD ))
148
+ self .log .error ("Invalid WMSHistory accounting record " , "%s -> %s" % (retVal ["Message" ], rD ))
105
149
else :
106
150
self .datastores ["Accounting" ].addRegister (acWMS )
107
151
108
152
for backend , datastore in self .datastores .items ():
109
- self .log .info ("Committing to %s backend" % backend )
153
+ self .log .info ("Committing WMSHistory records to %s backend" % backend )
110
154
result = datastore .commit ()
111
155
if not result ["OK" ]:
112
- self .log .error ("Couldn't commit WMS history to %s" % backend , result ["Message" ])
156
+ self .log .error ("Couldn't commit WMSHistory to %s" % backend , result ["Message" ])
113
157
return S_ERROR ()
114
- self .log .verbose ("Done committing to %s backend" % backend )
158
+ self .log .verbose ("Done committing WMSHistory to %s backend" % backend )
115
159
116
160
return S_OK ()
0 commit comments