Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@

## Next release

### ✨ Improved

* [#55](https://github.com/sdss/lvmgort/pull/55) Improved messaging when the alerts data is unavailable. A resolution message is now sent once the alerts data becomes available again.

### 🔧 Fixed

* Explicitly raise exceptions in `AGSet.reconnect()`.
Expand Down
4 changes: 2 additions & 2 deletions src/gort/devices/guider.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from __future__ import annotations

import asyncio
from datetime import UTC, datetime, timedelta
from datetime import datetime, timedelta, timezone
from functools import partial

from typing import TYPE_CHECKING
Expand Down Expand Up @@ -376,7 +376,7 @@ async def _monitor_task(self, timeout: float = 30):
# Remove NaN rows.
df = df.drop_nulls()

now = datetime.now(UTC)
now = datetime.now(timezone.utc)
time_range = now - timedelta(seconds=timeout)
time_data = df.filter(polars.col.time > time_range).sort("time")

Expand Down
69 changes: 54 additions & 15 deletions src/gort/overwatcher/alerts.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ class ActiveAlert(enum.Flag):
CAMERA_TEMPERATURE = enum.auto()
O2 = enum.auto()
E_STOPS = enum.auto()
ALERTS_UNAVAILABLE = enum.auto()
ALERTS_DATA_UNAVAILABLE = enum.auto()
DISCONNECTED = enum.auto()
DOME_LOCKED = enum.auto()
IDLE = enum.auto()
Expand All @@ -78,37 +78,75 @@ class ActiveAlert(enum.Flag):
NO_CLOSE = DOOR | E_STOPS | ENGINEERING_OVERRIDE


# Human-readable, one-line description for ActiveAlert flags.
# Keys are single ActiveAlert members; the composite NO_CLOSE mask has no
# entry here. Values are plain sentences without a trailing period.
# NOTE(review): presumably used to build notification/log messages for the
# currently active alerts -- confirm against the callers.
ACTIVE_ALARM_DESCRIPTIONS: dict[ActiveAlert, str] = {
ActiveAlert.HUMIDITY: "Humidity is above the threshold",
ActiveAlert.DEW_POINT: "Ambient temperature is below dew point temperature",
ActiveAlert.WIND: "Wind speed is above the threshold",
ActiveAlert.RAIN: "Rain detected",
ActiveAlert.DOOR: "Enclosure door is open",
ActiveAlert.CAMERA_TEMPERATURE: "Spec camera temperature is above the threshold",
ActiveAlert.O2: "O2 levels are below the safe range",
ActiveAlert.E_STOPS: "Emergency stops have been triggered",
ActiveAlert.ALERTS_DATA_UNAVAILABLE: "Alerts data is unavailable",
ActiveAlert.DISCONNECTED: "Connectivity lost",
ActiveAlert.DOME_LOCKED: "Dome is locked",
ActiveAlert.IDLE: "Overwatcher has been idle for too long",
ActiveAlert.ENGINEERING_OVERRIDE: "Engineering mode is enabled",
ActiveAlert.UNKNOWN: "Unknown alert",
}


class AlertsMonitorTask(OverwatcherModuleTask["AlertsOverwatcher"]):
"""Monitors the alerts state."""

name = "alerts_monitor"
keep_alive = True
restart_on_error = True

INTERVAL: float = 20
INTERVAL: float = 30
N_FAILURES: int = 5

async def task(self):
"""Updates the alerts data."""

n_failures: int = 0
last_error: str = "Undefined error"

while True:
try:
await self.update_alerts()

except Exception as err:
self.log.error(f"Failed to get alerts data: {decap(err)}")
self.log.error(f"Failed retriving alerts data: {decap(err)}")
n_failures += 1
last_error = str(err)

else:
self.module.last_updated = time()
self.module.unavailable = False

# Send a resolution message if needed.
if self.module.alerts_data_unavailable:
await self.module.notify(
"[RESOLVED]: Alerts data is now available.",
level="critical",
)

self.module.alerts_data_unavailable = False

n_failures = 0
last_error = "Undefined error"

finally:
if self.module.unavailable is False and n_failures >= 5:
if (
self.module.alerts_data_unavailable is False
and n_failures >= self.N_FAILURES
):
await self.module.notify(
"Alerts data is unavailable.",
"Failed to retrieve alerts data multiple times: "
f"{decap(last_error, add_period=True)}",
level="critical",
)
self.module.unavailable = True
self.module.alerts_data_unavailable = True

await asyncio.sleep(self.INTERVAL)

Expand All @@ -118,12 +156,12 @@ async def update_alerts(self):
data = await self.module.update_status()

if data is None:
raise ValueError("No alerts data available.")
raise ValueError("API /alerts response failed or returned no data.")

# For some very critical alerts, we require them to be not null (null here
# means no data was available or the API failed getting the alert data).
if data.rain is None or data.humidity_alert is None or data.wind_alert is None:
raise ValueError("Incomplete alerts data.")
raise ValueError("Incomplete weather data in API /alerts response.")


class AlertsOverwatcher(OverwatcherModule):
Expand All @@ -141,7 +179,8 @@ def __init__(self, *args, **kwargs):

self.last_updated: float = 0.0
self.idle_since: float = 0.0
self.unavailable: bool = False

self.alerts_data_unavailable: bool = False

async def is_safe(self) -> tuple[bool, ActiveAlert]:
"""Determines whether it is safe to open."""
Expand All @@ -159,17 +198,17 @@ async def is_safe(self) -> tuple[bool, ActiveAlert]:
else:
self.idle_since = 0

if self.unavailable:
if self.alerts_data_unavailable:
self.log.warning("Alerts data is unavailable.")
active_alerts |= ActiveAlert.ALERTS_UNAVAILABLE
active_alerts |= ActiveAlert.ALERTS_DATA_UNAVAILABLE
return False, active_alerts

if self.unavailable is False and time() - self.last_updated > 300:
if self.alerts_data_unavailable is False and time() - self.last_updated > 300:
# If the data is not unavailable but it has not been updated
# in the last 5 minutes, something is wrong. We mark it as unavailable.
self.log.warning("Alerts data has not been updated in the last 5 minutes.")
self.unavailable = True
active_alerts |= ActiveAlert.ALERTS_UNAVAILABLE
self.alerts_data_unavailable = True
active_alerts |= ActiveAlert.ALERTS_DATA_UNAVAILABLE
return False, active_alerts

if self.state.rain:
Expand Down
24 changes: 16 additions & 8 deletions src/gort/overwatcher/overwatcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,17 @@ async def task(self):
)

if not ow.state.safe:
await self.handle_unsafe()
try:
await self.handle_unsafe()
except Exception as err:
error_message = decap(err, add_period=True)
await ow.notify(
f"Error handling unsafe conditions: {error_message} "
"Closing the dome as a last resort.",
level="error",
error=err,
)
await ow.dome.close(retry=True)

if not ow.ephemeris.is_night(mode="observer"):
# We use the observer mode to allow stopping some minutes
Expand Down Expand Up @@ -231,7 +241,7 @@ async def handle_unsafe(self):

try:
await ow.shutdown(
reason=f"Unsafe conditions detected: {alert_names}",
reason=f"Unsafe conditions detected - {alert_names}",
close_dome=close_dome,
disable_overwatcher=disable_overwatcher,
)
Expand Down Expand Up @@ -262,7 +272,7 @@ async def handle_daytime(self):
immediate=True,
block=True,
)
elif not self.overwatcher.observer.is_cancelling:
elif self.overwatcher.observer.is_cancelling:
# We have already cancelled the loop and are waiting for the
# current exposure to finish. We don't do anything.
return
Expand All @@ -271,7 +281,7 @@ async def handle_daytime(self):
# the current exposure but allow it to finish.
await self.overwatcher.observer.stop_observing(
immediate=False,
reason="daytime conditions detected",
reason="morning twilight reached",
)
return

Expand All @@ -286,7 +296,7 @@ async def handle_daytime(self):
# to do it (we still want to allow observers to enable it in the evening).
# The overwatcher is disabled when the SJD changes.
await self.overwatcher.shutdown(
reason="Daytime conditions detected.",
reason="Daytime conditions detected",
level="info",
close_dome=True,
retry=True,
Expand Down Expand Up @@ -562,9 +572,7 @@ async def shutdown(
if not reason:
message = "Triggering shutdown."
else:
if not reason.endswith("."):
reason += "."
message = f"Triggering shutdown. Reason: {decap(reason)}"
message = f"Triggering shutdown. Reason: {decap(reason, add_period=True)}"

await self.notify(message, level=level)

Expand Down
9 changes: 7 additions & 2 deletions src/gort/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -1146,7 +1146,7 @@ def _get_message(self, message: str):
return self.header.format(**locals()) + message


def decap(string: Any, add_period: bool = False):
def decap(string: Any, add_period: bool = False) -> str:
"""Decapitalises the first letter of a string."""

if not isinstance(string, str):
Expand All @@ -1155,7 +1155,12 @@ def decap(string: Any, add_period: bool = False):
if len(string) == 0:
return ""

decapped = str(string[0].lower() + string[1:])
first_word = string.split(" ")[0]
if len(first_word) > 1 and first_word.isupper():
# Do not decap acronyms.
decapped = string
else:
decapped = str(string[0].lower() + string[1:])

if not add_period:
return decapped
Expand Down
Loading