@@ -82,10 +82,10 @@ def list_active_alarms(namespace: str = "") -> str:
8282
8383@tool
8484def get_metric_statistics (
85- namespace : str ,
86- metric_name : str ,
87- dimensions : str ,
88- period_minutes : int = 30 ,
85+ namespace : str ,
86+ metric_name : str ,
87+ dimensions : str ,
88+ period_minutes : int = 30 ,
8989) -> str :
9090 """
9191 Retrieve CloudWatch metric statistics for the last N minutes.
@@ -101,7 +101,7 @@ def get_metric_statistics(
101101 JSON string with datapoints (timestamp, average, sum, unit).
102102 """
103103 cw = boto3 .client ("cloudwatch" , region_name = AWS_REGION )
104- end_time = datetime .datetime .utcnow ( )
104+ end_time = datetime .datetime .now ( datetime . timezone . utc )
105105 start_time = end_time - datetime .timedelta (minutes = period_minutes )
106106
107107 try :
@@ -136,10 +136,10 @@ def get_metric_statistics(
136136
137137@tool
138138def fetch_log_events (
139- log_group : str ,
140- filter_pattern : str = "ERROR" ,
141- minutes_back : int = 15 ,
142- max_events : int = 50 ,
139+ log_group : str ,
140+ filter_pattern : str = "ERROR" ,
141+ minutes_back : int = 15 ,
142+ max_events : int = 50 ,
143143) -> str :
144144 """
145145 Fetch recent CloudWatch Logs events matching a filter pattern.
@@ -154,7 +154,7 @@ def fetch_log_events(
154154 JSON string with matching log events including timestamp and message.
155155 """
156156 logs = boto3 .client ("logs" , region_name = AWS_REGION )
157- end_time = int (datetime .datetime .utcnow ( ).timestamp () * 1000 )
157+ end_time = int (datetime .datetime .now ( datetime . timezone . utc ).timestamp () * 1000 )
158158 start_time = end_time - (minutes_back * 60 * 1000 )
159159
160160 try :
@@ -168,7 +168,7 @@ def fetch_log_events(
168168 events = [
169169 {
170170 "timestamp" : str (
171- datetime .datetime .utcfromtimestamp (e ["timestamp" ] / 1000 )
171+ datetime .datetime .fromtimestamp (e ["timestamp" ] / 1000 , datetime . timezone . utc )
172172 ),
173173 "message" : e ["message" ].strip (),
174174 "stream" : e .get ("logStreamName" , "" ),
@@ -279,7 +279,7 @@ def helm_rollback(release: str, revision: int = 0, namespace: str = "default") -
279279
280280@tool
281281def helm_scale (
282- release : str , replicas : int , namespace : str = "default"
282+ release : str , replicas : int , namespace : str = "default"
283283) -> str :
284284 """
285285 Scale a Helm-managed deployment by patching replica count.
@@ -335,7 +335,7 @@ def post_incident_report(summary: str, severity: str = "P2") -> str:
335335 Returns:
336336 Confirmation of where the report was sent.
337337 """
338- timestamp = datetime .datetime .utcnow ( ).strftime ("%Y-%m-%d %H:%M UTC" )
338+ timestamp = datetime .datetime .now ( datetime . timezone . utc ).strftime ("%Y-%m-%d %H:%M UTC" ). replace ( "+00:00" , " " )
339339 report = (
340340 f"*[{ severity } ] SRE Incident Report — { timestamp } *\n \n { summary } "
341341 )
@@ -360,10 +360,10 @@ def post_incident_report(summary: str, severity: str = "P2") -> str:
360360
361361
362362# ---------------------------------------------------------------------------
363- # Sub-Agents
363+ # Sub-Agents wrapped as Tools (Agents-as-Tools pattern)
364364# ---------------------------------------------------------------------------
365365
366- cloudwatch_agent = Agent (
366+ _cloudwatch_agent = Agent (
367367 model = model ,
368368 system_prompt = """You are a CloudWatch Monitoring specialist.
369369Your job is to:
@@ -377,7 +377,7 @@ def post_incident_report(summary: str, severity: str = "P2") -> str:
377377 tools = [list_active_alarms , get_metric_statistics , fetch_log_events ],
378378)
379379
380- rca_agent = Agent (
380+ _rca_agent = Agent (
381381 model = model ,
382382 system_prompt = """You are a senior Site Reliability Engineer performing root cause analysis.
383383Given alarm data, metrics, and log snippets, your job is to:
@@ -391,7 +391,7 @@ def post_incident_report(summary: str, severity: str = "P2") -> str:
391391 tools = [],
392392)
393393
394- remediation_agent = Agent (
394+ _remediation_agent = Agent (
395395 model = model ,
396396 system_prompt = """You are a Kubernetes and Helm operations expert.
397397Given a root cause analysis, your job is to:
@@ -405,6 +405,55 @@ def post_incident_report(summary: str, severity: str = "P2") -> str:
405405 tools = [kubectl_get , kubectl_rollout_restart , helm_rollback , helm_scale ],
406406)
407407
408+
@tool
def cloudwatch_agent(task: str) -> str:
    """
    Delegate a CloudWatch monitoring task to the specialist agent.
    Use this to list active alarms, fetch metric statistics, and pull error logs.

    Args:
        task: Natural language description of the monitoring task to perform.

    Returns:
        Structured summary of alarms, metrics, and log events found.
    """
    # NOTE(review): the docstring above is presumably read by @tool to build
    # the tool description — wording kept unchanged on purpose; confirm.
    # Forward the task to the module-level _cloudwatch_agent and hand back the
    # string form of whatever that call returns.
    return str(_cloudwatch_agent(task))
423+
424+
@tool
def rca_agent(context: str) -> str:
    """
    Delegate root cause analysis to the SRE specialist agent.
    Provide alarm data, metrics, and log snippets as context.

    Args:
        context: Full context including alarm details, metric values, and log events.

    Returns:
        Root cause analysis with severity rating and ranked remediation options.
    """
    # NOTE(review): the docstring above is presumably read by @tool to build
    # the tool description — wording kept unchanged on purpose; confirm.
    # Forward the context to the module-level _rca_agent and hand back the
    # string form of whatever that call returns.
    return str(_rca_agent(context))
439+
440+
@tool
def remediation_agent(instructions: str) -> str:
    """
    Delegate Kubernetes/Helm remediation to the operations specialist agent.
    Use this to inspect workloads and apply rollback, restart, or scaling actions.

    Args:
        instructions: Root cause analysis and remediation instructions.

    Returns:
        Confirmation of actions taken or dry-run command output.
    """
    # NOTE(review): the docstring above is presumably read by @tool to build
    # the tool description — wording kept unchanged on purpose; confirm.
    # Forward the instructions to the module-level _remediation_agent and hand
    # back the string form of whatever that call returns.
    return str(_remediation_agent(instructions))
455+
456+
408457# ---------------------------------------------------------------------------
409458# Supervisor Agent
410459# ---------------------------------------------------------------------------
@@ -414,9 +463,9 @@ def post_incident_report(summary: str, severity: str = "P2") -> str:
414463 system_prompt = """You are the SRE Incident Commander orchestrating an incident response.
415464
416465Follow this workflow:
417- 1. Delegate to the cloudwatch_agent to gather all alarm and metric data.
418- 2. Delegate to the rca_agent to perform root cause analysis on that data .
419- 3. Delegate to the remediation_agent to inspect workloads and apply a fix.
466+ 1. Call cloudwatch_agent to gather all alarm and metric data.
467+ 2. Call rca_agent with the gathered data to perform root cause analysis.
468+ 3. Call remediation_agent with the RCA findings to inspect workloads and apply a fix.
4204694. Synthesise findings into a final incident report and post it using the
421470 post_incident_report tool.
422471
@@ -426,8 +475,7 @@ def post_incident_report(summary: str, severity: str = "P2") -> str:
426475- What was done (remediation action)
427476- What to watch next (follow-up items)
428477""" ,
429- tools = [post_incident_report ],
430- agents = [cloudwatch_agent , rca_agent , remediation_agent ],
478+ tools = [cloudwatch_agent , rca_agent , remediation_agent , post_incident_report ],
431479)
432480
433481
@@ -459,4 +507,4 @@ def run_incident_response(trigger: str = "") -> None:
459507 import sys
460508
461509 user_trigger = " " .join (sys .argv [1 :]) if len (sys .argv ) > 1 else ""
462- run_incident_response (user_trigger )
510+ run_incident_response (user_trigger )
0 commit comments