Skip to content

Commit 5b8baea

Browse files
[DPE-6652] Refresh all charm libs (#225)
1 parent 9d8010f commit 5b8baea

File tree

6 files changed

+639
-275
lines changed

6 files changed

+639
-275
lines changed

lib/charms/data_platform_libs/v0/data_interfaces.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -331,7 +331,7 @@ def _on_topic_requested(self, event: TopicRequestedEvent):
331331

332332
# Increment this PATCH version before using `charmcraft publish-lib` or reset
333333
# to 0 if you are raising the major API version
334-
LIBPATCH = 40
334+
LIBPATCH = 41
335335

336336
PYDEPS = ["ops>=2.0.0"]
337337

@@ -609,7 +609,7 @@ def get_group(self, group: str) -> Optional[SecretGroup]:
609609
class CachedSecret:
610610
"""Locally cache a secret.
611611
612-
The data structure is precisely re-using/simulating as in the actual Secret Storage
612+
The data structure precisely reuses/simulates the one used in the actual Secret Storage
613613
"""
614614

615615
KNOWN_MODEL_ERRORS = [MODEL_ERRORS["no_label_and_uri"], MODEL_ERRORS["owner_no_refresh"]]
@@ -2363,7 +2363,6 @@ def _update_relation_data(self, relation: Relation, data: Dict[str, str]) -> Non
23632363
def _delete_relation_data(self, relation: Relation, fields: List[str]) -> None:
23642364
"""Delete data available (directly or indirectly -- i.e. secrets) from the relation for owner/this_app."""
23652365
if self.secret_fields and self.deleted_label:
2366-
23672366
_, normal_fields = self._process_secret_fields(
23682367
relation,
23692368
self.secret_fields,

lib/charms/grafana_agent/v0/cos_agent.py

Lines changed: 115 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88
- `COSAgentProvider`: Use in machine charms that need to have a workload's metrics
99
or logs scraped, or forward rule files or dashboards to Prometheus, Loki or Grafana through
1010
the Grafana Agent machine charm.
11+
NOTE: Be sure to add `limit: 1` in your charm for the cos-agent relation. That is the only
12+
way we currently have to prevent two different grafana agent apps from being deployed on the same VM.
1113
1214
- `COSAgentConsumer`: Used in the Grafana Agent machine charm to manage the requirer side of
1315
the `cos_agent` interface.
@@ -22,7 +24,6 @@
2224
Using the `COSAgentProvider` object only requires instantiating it,
2325
typically in the `__init__` method of your charm (the one which sends telemetry).
2426
25-
The constructor of `COSAgentProvider` has only one required and ten optional parameters:
2627
2728
```python
2829
def __init__(
@@ -233,8 +234,8 @@ def __init__(self, *args):
233234
)
234235

235236
import pydantic
236-
from cosl import GrafanaDashboard, JujuTopology
237-
from cosl.rules import AlertRules
237+
from cosl import DashboardPath40UID, JujuTopology, LZMABase64
238+
from cosl.rules import AlertRules, generic_alert_groups
238239
from ops.charm import RelationChangedEvent
239240
from ops.framework import EventBase, EventSource, Object, ObjectEvents
240241
from ops.model import ModelError, Relation
@@ -253,9 +254,9 @@ class _MetricsEndpointDict(TypedDict):
253254

254255
LIBID = "dc15fa84cef84ce58155fb84f6c6213a"
255256
LIBAPI = 0
256-
LIBPATCH = 11
257+
LIBPATCH = 20
257258

258-
PYDEPS = ["cosl", "pydantic"]
259+
PYDEPS = ["cosl >= 0.0.50", "pydantic"]
259260

260261
DEFAULT_RELATION_NAME = "cos-agent"
261262
DEFAULT_PEER_RELATION_NAME = "peers"
@@ -267,7 +268,6 @@ class _MetricsEndpointDict(TypedDict):
267268
logger = logging.getLogger(__name__)
268269
SnapEndpoint = namedtuple("SnapEndpoint", "owner, name")
269270

270-
271271
# Note: MutableMapping is imported from the typing module and not collections.abc
272272
# because subscripting collections.abc.MutableMapping was added in python 3.9, but
273273
# most of our charms are based on 20.04, which has python 3.8.
@@ -317,7 +317,11 @@ class NotReadyError(TracingError):
317317
"""Raised by the provider wrapper if a requirer hasn't published the required data (yet)."""
318318

319319

320-
class ProtocolNotRequestedError(TracingError):
320+
class ProtocolNotFoundError(TracingError):
321+
"""Raised if the user doesn't receive an endpoint for a protocol it requested."""
322+
323+
324+
class ProtocolNotRequestedError(ProtocolNotFoundError):
321325
"""Raised if the user attempts to obtain an endpoint for a protocol it did not request."""
322326

323327

@@ -476,7 +480,7 @@ class CosAgentProviderUnitData(DatabagModel):
476480
# this needs to make its way to the gagent leader
477481
metrics_alert_rules: dict
478482
log_alert_rules: dict
479-
dashboards: List[GrafanaDashboard]
483+
dashboards: List[str]
480484
# subordinate is no longer used but we should keep it until we bump the library to ensure
481485
# we don't break compatibility.
482486
subordinate: Optional[bool] = None
@@ -509,7 +513,7 @@ class CosAgentPeersUnitData(DatabagModel):
509513
# of the outgoing o11y relations.
510514
metrics_alert_rules: Optional[dict]
511515
log_alert_rules: Optional[dict]
512-
dashboards: Optional[List[GrafanaDashboard]]
516+
dashboards: Optional[List[str]]
513517

514518
# when this whole datastructure is dumped into a databag, it will be nested under this key.
515519
# while not strictly necessary (we could have it 'flattened out' into the databag),
@@ -579,7 +583,7 @@ class Receiver(pydantic.BaseModel):
579583
"""Specification of an active receiver."""
580584

581585
protocol: ProtocolType = pydantic.Field(..., description="Receiver protocol name and type.")
582-
url: str = pydantic.Field(
586+
url: Optional[str] = pydantic.Field(
583587
...,
584588
description="""URL at which the receiver is reachable. If there's an ingress, it would be the external URL.
585589
Otherwise, it would be the service's fqdn or internal IP.
@@ -727,6 +731,10 @@ def _metrics_alert_rules(self) -> Dict:
727731
query_type="promql", topology=JujuTopology.from_charm(self._charm)
728732
)
729733
alert_rules.add_path(self._metrics_rules, recursive=self._recursive)
734+
alert_rules.add(
735+
generic_alert_groups.application_rules,
736+
group_name_prefix=JujuTopology.from_charm(self._charm).identifier,
737+
)
730738
return alert_rules.as_dict()
731739

732740
@property
@@ -737,12 +745,27 @@ def _log_alert_rules(self) -> Dict:
737745
return alert_rules.as_dict()
738746

739747
@property
740-
def _dashboards(self) -> List[GrafanaDashboard]:
741-
dashboards: List[GrafanaDashboard] = []
748+
def _dashboards(self) -> List[str]:
749+
dashboards: List[str] = []
742750
for d in self._dashboard_dirs:
743751
for path in Path(d).glob("*"):
744-
dashboard = GrafanaDashboard._serialize(path.read_bytes())
745-
dashboards.append(dashboard)
752+
with open(path, "rt") as fp:
753+
dashboard = json.load(fp)
754+
rel_path = str(
755+
path.relative_to(self._charm.charm_dir) if path.is_absolute() else path
756+
)
757+
# COSAgentProvider is somewhat analogous to GrafanaDashboardProvider. We need to overwrite the uid here
758+
# because there is currently no other way to communicate the dashboard path separately.
759+
# https://github.com/canonical/grafana-k8s-operator/pull/363
760+
dashboard["uid"] = DashboardPath40UID.generate(self._charm.meta.name, rel_path)
761+
762+
# Add tags
763+
tags: List[str] = dashboard.get("tags", [])
764+
if not any(tag.startswith("charm: ") for tag in tags):
765+
tags.append(f"charm: {self._charm.meta.name}")
766+
dashboard["tags"] = tags
767+
768+
dashboards.append(LZMABase64.compress(json.dumps(dashboard)))
746769
return dashboards
747770

748771
@property
@@ -768,7 +791,7 @@ def is_ready(self, relation: Optional[Relation] = None):
768791
"""Is this endpoint ready?"""
769792
relation = relation or self._relation
770793
if not relation:
771-
logger.debug(f"no relation on {self._relation_name !r}: tracing not ready")
794+
logger.debug(f"no relation on {self._relation_name!r}: tracing not ready")
772795
return False
773796
if relation.data is None:
774797
logger.error(f"relation data is None for {relation}")
@@ -802,29 +825,48 @@ def get_all_endpoints(
802825

803826
def _get_tracing_endpoint(
804827
self, relation: Optional[Relation], protocol: ReceiverProtocol
805-
) -> Optional[str]:
828+
) -> str:
829+
"""Return a tracing endpoint URL if it is available or raise a ProtocolNotFoundError."""
806830
unit_data = self.get_all_endpoints(relation)
807831
if not unit_data:
808-
return None
832+
# we didn't find the protocol because the remote end didn't publish any data yet
833+
# it might also mean that grafana-agent doesn't have a relation to the tracing backend
834+
raise ProtocolNotFoundError(protocol)
809835
receivers: List[Receiver] = [i for i in unit_data.receivers if i.protocol.name == protocol]
810836
if not receivers:
811-
logger.error(f"no receiver found with protocol={protocol!r}")
812-
return None
837+
# we didn't find the protocol because grafana-agent didn't return us the protocol that we requested
838+
# the caller might want to verify that we did indeed request this protocol
839+
raise ProtocolNotFoundError(protocol)
813840
if len(receivers) > 1:
814-
logger.error(
841+
logger.warning(
815842
f"too many receivers with protocol={protocol!r}; using first one. Found: {receivers}"
816843
)
817-
return None
818844

819845
receiver = receivers[0]
846+
if not receiver.url:
847+
# grafana-agent isn't connected to the tracing backend yet
848+
raise ProtocolNotFoundError(protocol)
820849
return receiver.url
821850

822851
def get_tracing_endpoint(
823852
self, protocol: ReceiverProtocol, relation: Optional[Relation] = None
824-
) -> Optional[str]:
825-
"""Receiver endpoint for the given protocol."""
826-
endpoint = self._get_tracing_endpoint(relation or self._relation, protocol=protocol)
827-
if not endpoint:
853+
) -> str:
854+
"""Receiver endpoint for the given protocol.
855+
856+
It could happen that this function gets called before the provider publishes the endpoints.
857+
In such a scenario, if a non-leader unit calls this function, a permission denied exception will be raised due to
858+
restricted access. To prevent this, this function needs to be guarded by the `is_ready` check.
859+
860+
Raises:
861+
ProtocolNotRequestedError:
862+
If the charm unit is the leader unit and attempts to obtain an endpoint for a protocol it did not request.
863+
ProtocolNotFoundError:
864+
If the charm attempts to obtain an endpoint when grafana-agent isn't related to a tracing backend.
865+
"""
866+
try:
867+
return self._get_tracing_endpoint(relation or self._relation, protocol=protocol)
868+
except ProtocolNotFoundError:
869+
# let's see if we didn't find it because we didn't request the endpoint
828870
requested_protocols = set()
829871
relations = [relation] if relation else self.relations
830872
for relation in relations:
@@ -839,8 +881,7 @@ def get_tracing_endpoint(
839881
if protocol not in requested_protocols:
840882
raise ProtocolNotRequestedError(protocol, relation)
841883

842-
return None
843-
return endpoint
884+
raise
844885

845886

846887
class COSAgentDataChanged(EventBase):
@@ -902,6 +943,8 @@ def __init__(
902943
events.relation_joined, self._on_relation_data_changed
903944
) # TODO: do we need this?
904945
self.framework.observe(events.relation_changed, self._on_relation_data_changed)
946+
self.framework.observe(events.relation_departed, self._on_relation_departed)
947+
905948
for event in self._refresh_events:
906949
self.framework.observe(event, self.trigger_refresh) # pyright: ignore
907950

@@ -929,6 +972,26 @@ def _on_peer_relation_changed(self, _):
929972
if self._charm.unit.is_leader():
930973
self.on.data_changed.emit() # pyright: ignore
931974

975+
def _on_relation_departed(self, event):
976+
"""Remove provider's (principal's) alert rules and dashboards from peer data when the cos-agent relation to the principal is removed."""
977+
if not self.peer_relation:
978+
event.defer()
979+
return
980+
# empty the departing unit's alert rules and dashboards from peer data
981+
data = CosAgentPeersUnitData(
982+
unit_name=event.unit.name,
983+
relation_id=str(event.relation.id),
984+
relation_name=event.relation.name,
985+
metrics_alert_rules={},
986+
log_alert_rules={},
987+
dashboards=[],
988+
)
989+
self.peer_relation.data[self._charm.unit][
990+
f"{CosAgentPeersUnitData.KEY}-{event.unit.name}"
991+
] = data.json()
992+
993+
self.on.data_changed.emit() # pyright: ignore
994+
932995
def _on_relation_data_changed(self, event: RelationChangedEvent):
933996
# Peer data is the only means of communication between subordinate units.
934997
if not self.peer_relation:
@@ -988,7 +1051,16 @@ def update_tracing_receivers(self):
9881051
CosAgentRequirerUnitData(
9891052
receivers=[
9901053
Receiver(
991-
url=f"{self._get_tracing_receiver_url(protocol)}",
1054+
# if tracing isn't ready, we don't want the wrong receiver URLs present in the databag.
1055+
# however, because of the backwards compatibility requirements, we need to still provide
1056+
# the protocols list so that the charm with older cos_agent version doesn't error its hooks.
1057+
# before this change was added, the charm with old cos_agent version threw exceptions with
1058+
# connections to grafana-agent timing out. After the change, the charm will fail validating
1059+
# databag contents (as it expects a string in URL) but that won't cause any errors as
1060+
# tracing endpoints are the only content in the grafana-agent's side of the databag.
1061+
url=f"{self._get_tracing_receiver_url(protocol)}"
1062+
if self._charm.tracing.is_ready() # type: ignore
1063+
else None,
9921064
protocol=ProtocolType(
9931065
name=protocol,
9941066
type=receiver_protocol_to_transport_protocol[protocol],
@@ -1030,8 +1102,7 @@ def _get_requested_protocols(self, relation: Relation):
10301102
if len(units) > 1:
10311103
# should never happen
10321104
raise ValueError(
1033-
f"unexpected error: subordinate relation {relation} "
1034-
f"should have exactly one unit"
1105+
f"unexpected error: subordinate relation {relation} should have exactly one unit"
10351106
)
10361107

10371108
unit = next(iter(units), None)
@@ -1287,7 +1358,7 @@ def dashboards(self) -> List[Dict[str, str]]:
12871358
seen_apps.append(app_name)
12881359

12891360
for encoded_dashboard in data.dashboards or ():
1290-
content = GrafanaDashboard(encoded_dashboard)._deserialize()
1361+
content = json.loads(LZMABase64.decompress(encoded_dashboard))
12911362

12921363
title = content.get("title", "no_title")
12931364

@@ -1314,44 +1385,32 @@ def charm_tracing_config(
13141385
If https endpoint is provided but cert_path is not found on disk:
13151386
disable charm tracing.
13161387
If https endpoint is provided and cert_path is None:
1317-
ERROR
1388+
raise TracingError
13181389
Else:
13191390
proceed with charm tracing (with or without tls, as appropriate)
13201391
13211392
Usage:
1322-
If you are using charm_tracing >= v1.9:
1323-
>>> from lib.charms.tempo_k8s.v1.charm_tracing import trace_charm
1324-
>>> from lib.charms.tempo_k8s.v0.cos_agent import charm_tracing_config
1393+
>>> from lib.charms.tempo_coordinator_k8s.v0.charm_tracing import trace_charm
1394+
>>> from lib.charms.tempo_coordinator_k8s.v0.tracing import charm_tracing_config
13251395
>>> @trace_charm(tracing_endpoint="my_endpoint", cert_path="cert_path")
13261396
>>> class MyCharm(...):
13271397
>>> _cert_path = "/path/to/cert/on/charm/container.crt"
13281398
>>> def __init__(self, ...):
1329-
>>> self.cos_agent = COSAgentProvider(...)
1399+
>>> self.tracing = TracingEndpointRequirer(...)
13301400
>>> self.my_endpoint, self.cert_path = charm_tracing_config(
1331-
... self.cos_agent, self._cert_path)
1332-
1333-
If you are using charm_tracing < v1.9:
1334-
>>> from lib.charms.tempo_k8s.v1.charm_tracing import trace_charm
1335-
>>> from lib.charms.tempo_k8s.v2.tracing import charm_tracing_config
1336-
>>> @trace_charm(tracing_endpoint="my_endpoint", cert_path="cert_path")
1337-
>>> class MyCharm(...):
1338-
>>> _cert_path = "/path/to/cert/on/charm/container.crt"
1339-
>>> def __init__(self, ...):
1340-
>>> self.cos_agent = COSAgentProvider(...)
1341-
>>> self.my_endpoint, self.cert_path = charm_tracing_config(
1342-
... self.cos_agent, self._cert_path)
1343-
>>> @property
1344-
>>> def my_endpoint(self):
1345-
>>> return self._my_endpoint
1346-
>>> @property
1347-
>>> def cert_path(self):
1348-
>>> return self._cert_path
1349-
1401+
... self.tracing, self._cert_path)
13501402
"""
13511403
if not endpoint_requirer.is_ready():
13521404
return None, None
13531405

1354-
endpoint = endpoint_requirer.get_tracing_endpoint("otlp_http")
1406+
try:
1407+
endpoint = endpoint_requirer.get_tracing_endpoint("otlp_http")
1408+
except ProtocolNotFoundError:
1409+
logger.warn(
1410+
"Endpoint for tracing wasn't provided as tracing backend isn't ready yet. If grafana-agent isn't connected to a tracing backend, integrate it. Otherwise this issue should resolve itself in a few events."
1411+
)
1412+
return None, None
1413+
13551414
if not endpoint:
13561415
return None, None
13571416

0 commit comments

Comments
 (0)