Skip to content

Commit 6455a63

Browse files
Ayush Jha
authored and committed
feat(02-samples): fixed test issues
1 parent 1aba0a6 commit 6455a63

File tree

2 files changed

+79
-30
lines changed

2 files changed

+79
-30
lines changed

02-samples/19-sre-incident-response-agent/sre_agent.py

Lines changed: 71 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -82,10 +82,10 @@ def list_active_alarms(namespace: str = "") -> str:
8282

8383
@tool
8484
def get_metric_statistics(
85-
namespace: str,
86-
metric_name: str,
87-
dimensions: str,
88-
period_minutes: int = 30,
85+
namespace: str,
86+
metric_name: str,
87+
dimensions: str,
88+
period_minutes: int = 30,
8989
) -> str:
9090
"""
9191
Retrieve CloudWatch metric statistics for the last N minutes.
@@ -101,7 +101,7 @@ def get_metric_statistics(
101101
JSON string with datapoints (timestamp, average, sum, unit).
102102
"""
103103
cw = boto3.client("cloudwatch", region_name=AWS_REGION)
104-
end_time = datetime.datetime.utcnow()
104+
end_time = datetime.datetime.now(datetime.timezone.utc)
105105
start_time = end_time - datetime.timedelta(minutes=period_minutes)
106106

107107
try:
@@ -136,10 +136,10 @@ def get_metric_statistics(
136136

137137
@tool
138138
def fetch_log_events(
139-
log_group: str,
140-
filter_pattern: str = "ERROR",
141-
minutes_back: int = 15,
142-
max_events: int = 50,
139+
log_group: str,
140+
filter_pattern: str = "ERROR",
141+
minutes_back: int = 15,
142+
max_events: int = 50,
143143
) -> str:
144144
"""
145145
Fetch recent CloudWatch Logs events matching a filter pattern.
@@ -154,7 +154,7 @@ def fetch_log_events(
154154
JSON string with matching log events including timestamp and message.
155155
"""
156156
logs = boto3.client("logs", region_name=AWS_REGION)
157-
end_time = int(datetime.datetime.utcnow().timestamp() * 1000)
157+
end_time = int(datetime.datetime.now(datetime.timezone.utc).timestamp() * 1000)
158158
start_time = end_time - (minutes_back * 60 * 1000)
159159

160160
try:
@@ -168,7 +168,7 @@ def fetch_log_events(
168168
events = [
169169
{
170170
"timestamp": str(
171-
datetime.datetime.utcfromtimestamp(e["timestamp"] / 1000)
171+
datetime.datetime.fromtimestamp(e["timestamp"] / 1000, datetime.timezone.utc)
172172
),
173173
"message": e["message"].strip(),
174174
"stream": e.get("logStreamName", ""),
@@ -279,7 +279,7 @@ def helm_rollback(release: str, revision: int = 0, namespace: str = "default") -
279279

280280
@tool
281281
def helm_scale(
282-
release: str, replicas: int, namespace: str = "default"
282+
release: str, replicas: int, namespace: str = "default"
283283
) -> str:
284284
"""
285285
Scale a Helm-managed deployment by patching replica count.
@@ -335,7 +335,7 @@ def post_incident_report(summary: str, severity: str = "P2") -> str:
335335
Returns:
336336
Confirmation of where the report was sent.
337337
"""
338-
timestamp = datetime.datetime.utcnow().strftime("%Y-%m-%d %H:%M UTC")
338+
timestamp = datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%d %H:%M UTC").replace("+00:00", "")
339339
report = (
340340
f"*[{severity}] SRE Incident Report — {timestamp}*\n\n{summary}"
341341
)
@@ -360,10 +360,10 @@ def post_incident_report(summary: str, severity: str = "P2") -> str:
360360

361361

362362
# ---------------------------------------------------------------------------
363-
# Sub-Agents
363+
# Sub-Agents wrapped as Tools (Agents-as-Tools pattern)
364364
# ---------------------------------------------------------------------------
365365

366-
cloudwatch_agent = Agent(
366+
_cloudwatch_agent = Agent(
367367
model=model,
368368
system_prompt="""You are a CloudWatch Monitoring specialist.
369369
Your job is to:
@@ -377,7 +377,7 @@ def post_incident_report(summary: str, severity: str = "P2") -> str:
377377
tools=[list_active_alarms, get_metric_statistics, fetch_log_events],
378378
)
379379

380-
rca_agent = Agent(
380+
_rca_agent = Agent(
381381
model=model,
382382
system_prompt="""You are a senior Site Reliability Engineer performing root cause analysis.
383383
Given alarm data, metrics, and log snippets, your job is to:
@@ -391,7 +391,7 @@ def post_incident_report(summary: str, severity: str = "P2") -> str:
391391
tools=[],
392392
)
393393

394-
remediation_agent = Agent(
394+
_remediation_agent = Agent(
395395
model=model,
396396
system_prompt="""You are a Kubernetes and Helm operations expert.
397397
Given a root cause analysis, your job is to:
@@ -405,6 +405,55 @@ def post_incident_report(summary: str, severity: str = "P2") -> str:
405405
tools=[kubectl_get, kubectl_rollout_restart, helm_rollback, helm_scale],
406406
)
407407

408+
409+
@tool
410+
def cloudwatch_agent(task: str) -> str:
411+
"""
412+
Delegate a CloudWatch monitoring task to the specialist agent.
413+
Use this to list active alarms, fetch metric statistics, and pull error logs.
414+
415+
Args:
416+
task: Natural language description of the monitoring task to perform.
417+
418+
Returns:
419+
Structured summary of alarms, metrics, and log events found.
420+
"""
421+
response = _cloudwatch_agent(task)
422+
return str(response)
423+
424+
425+
@tool
426+
def rca_agent(context: str) -> str:
427+
"""
428+
Delegate root cause analysis to the SRE specialist agent.
429+
Provide alarm data, metrics, and log snippets as context.
430+
431+
Args:
432+
context: Full context including alarm details, metric values, and log events.
433+
434+
Returns:
435+
Root cause analysis with severity rating and ranked remediation options.
436+
"""
437+
response = _rca_agent(context)
438+
return str(response)
439+
440+
441+
@tool
442+
def remediation_agent(instructions: str) -> str:
443+
"""
444+
Delegate Kubernetes/Helm remediation to the operations specialist agent.
445+
Use this to inspect workloads and apply rollback, restart, or scaling actions.
446+
447+
Args:
448+
instructions: Root cause analysis and remediation instructions.
449+
450+
Returns:
451+
Confirmation of actions taken or dry-run command output.
452+
"""
453+
response = _remediation_agent(instructions)
454+
return str(response)
455+
456+
408457
# ---------------------------------------------------------------------------
409458
# Supervisor Agent
410459
# ---------------------------------------------------------------------------
@@ -414,9 +463,9 @@ def post_incident_report(summary: str, severity: str = "P2") -> str:
414463
system_prompt="""You are the SRE Incident Commander orchestrating an incident response.
415464
416465
Follow this workflow:
417-
1. Delegate to the cloudwatch_agent to gather all alarm and metric data.
418-
2. Delegate to the rca_agent to perform root cause analysis on that data.
419-
3. Delegate to the remediation_agent to inspect workloads and apply a fix.
466+
1. Call cloudwatch_agent to gather all alarm and metric data.
467+
2. Call rca_agent with the gathered data to perform root cause analysis.
468+
3. Call remediation_agent with the RCA findings to inspect workloads and apply a fix.
420469
4. Synthesise findings into a final incident report and post it using the
421470
post_incident_report tool.
422471
@@ -426,8 +475,7 @@ def post_incident_report(summary: str, severity: str = "P2") -> str:
426475
- What was done (remediation action)
427476
- What to watch next (follow-up items)
428477
""",
429-
tools=[post_incident_report],
430-
agents=[cloudwatch_agent, rca_agent, remediation_agent],
478+
tools=[cloudwatch_agent, rca_agent, remediation_agent, post_incident_report],
431479
)
432480

433481

@@ -459,4 +507,4 @@ def run_incident_response(trigger: str = "") -> None:
459507
import sys
460508

461509
user_trigger = " ".join(sys.argv[1:]) if len(sys.argv) > 1 else ""
462-
run_incident_response(user_trigger)
510+
run_incident_response(user_trigger)

02-samples/19-sre-incident-response-agent/test_sre_agent.py

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -212,13 +212,14 @@ def test_prints_to_stdout_when_no_webhook(self, capsys):
212212
assert "P3" in captured.out
213213

214214
@patch("sre_agent.SLACK_WEBHOOK_URL", "https://hooks.slack.com/fake")
215-
@patch("sre_agent.urllib.request.urlopen")
216-
@patch("sre_agent.urllib.request.Request")
217-
def test_posts_to_slack_when_webhook_set(self, mock_req, mock_urlopen):
215+
def test_posts_to_slack_when_webhook_set(self):
218216
mock_resp = MagicMock()
219217
mock_resp.status = 200
220-
mock_urlopen.return_value.__enter__ = lambda s: mock_resp
221-
mock_urlopen.return_value.__exit__ = MagicMock(return_value=False)
222218

223-
result = post_incident_report("Critical incident", severity="P1")
224-
assert "Slack" in result or "200" in result
219+
with patch("urllib.request.Request"), \
220+
patch("urllib.request.urlopen") as mock_urlopen:
221+
mock_urlopen.return_value.__enter__ = lambda s: mock_resp
222+
mock_urlopen.return_value.__exit__ = MagicMock(return_value=False)
223+
224+
result = post_incident_report("Critical incident", severity="P1")
225+
assert "Slack" in result or "200" in result

0 commit comments

Comments
 (0)