8
8
- `COSAgentProvider`: Use in machine charms that need to have a workload's metrics
9
9
or logs scraped, or forward rule files or dashboards to Prometheus, Loki or Grafana through
10
10
the Grafana Agent machine charm.
11
+ NOTE: Be sure to add `limit: 1` in your charm for the cos-agent relation. That is the only
12
+ way we currently have to prevent two different grafana agent apps from being deployed on the same VM.
11
13
12
14
- `COSAgentConsumer`: Used in the Grafana Agent machine charm to manage the requirer side of
13
15
the `cos_agent` interface.
22
24
Using the `COSAgentProvider` object only requires instantiating it,
23
25
typically in the `__init__` method of your charm (the one which sends telemetry).
24
26
25
- The constructor of `COSAgentProvider` has only one required and ten optional parameters:
26
27
27
28
```python
28
29
def __init__(
@@ -233,8 +234,8 @@ def __init__(self, *args):
233
234
)
234
235
235
236
import pydantic
236
- from cosl import GrafanaDashboard , JujuTopology
237
- from cosl .rules import AlertRules
237
+ from cosl import DashboardPath40UID , JujuTopology , LZMABase64
238
+ from cosl .rules import AlertRules , generic_alert_groups
238
239
from ops .charm import RelationChangedEvent
239
240
from ops .framework import EventBase , EventSource , Object , ObjectEvents
240
241
from ops .model import ModelError , Relation
@@ -253,9 +254,9 @@ class _MetricsEndpointDict(TypedDict):
253
254
254
255
LIBID = "dc15fa84cef84ce58155fb84f6c6213a"
255
256
LIBAPI = 0
256
- LIBPATCH = 11
257
+ LIBPATCH = 20
257
258
258
- PYDEPS = ["cosl" , "pydantic" ]
259
+ PYDEPS = ["cosl >= 0.0.50 " , "pydantic" ]
259
260
260
261
DEFAULT_RELATION_NAME = "cos-agent"
261
262
DEFAULT_PEER_RELATION_NAME = "peers"
@@ -267,7 +268,6 @@ class _MetricsEndpointDict(TypedDict):
267
268
logger = logging .getLogger (__name__ )
268
269
SnapEndpoint = namedtuple ("SnapEndpoint" , "owner, name" )
269
270
270
-
271
271
# Note: MutableMapping is imported from the typing module and not collections.abc
272
272
# because subscripting collections.abc.MutableMapping was added in python 3.9, but
273
273
# most of our charms are based on 20.04, which has python 3.8.
@@ -317,7 +317,11 @@ class NotReadyError(TracingError):
317
317
"""Raised by the provider wrapper if a requirer hasn't published the required data (yet)."""
318
318
319
319
320
- class ProtocolNotRequestedError (TracingError ):
320
+ class ProtocolNotFoundError (TracingError ):
321
+ """Raised if the user doesn't receive an endpoint for a protocol it requested."""
322
+
323
+
324
+ class ProtocolNotRequestedError (ProtocolNotFoundError ):
321
325
"""Raised if the user attempts to obtain an endpoint for a protocol it did not request."""
322
326
323
327
@@ -476,7 +480,7 @@ class CosAgentProviderUnitData(DatabagModel):
476
480
# this needs to make its way to the gagent leader
477
481
metrics_alert_rules : dict
478
482
log_alert_rules : dict
479
- dashboards : List [GrafanaDashboard ]
483
+ dashboards : List [str ]
480
484
# subordinate is no longer used but we should keep it until we bump the library to ensure
481
485
# we don't break compatibility.
482
486
subordinate : Optional [bool ] = None
@@ -509,7 +513,7 @@ class CosAgentPeersUnitData(DatabagModel):
509
513
# of the outgoing o11y relations.
510
514
metrics_alert_rules : Optional [dict ]
511
515
log_alert_rules : Optional [dict ]
512
- dashboards : Optional [List [GrafanaDashboard ]]
516
+ dashboards : Optional [List [str ]]
513
517
514
518
# when this whole datastructure is dumped into a databag, it will be nested under this key.
515
519
# while not strictly necessary (we could have it 'flattened out' into the databag),
@@ -579,7 +583,7 @@ class Receiver(pydantic.BaseModel):
579
583
"""Specification of an active receiver."""
580
584
581
585
protocol : ProtocolType = pydantic .Field (..., description = "Receiver protocol name and type." )
582
- url : str = pydantic .Field (
586
+ url : Optional [ str ] = pydantic .Field (
583
587
...,
584
588
description = """URL at which the receiver is reachable. If there's an ingress, it would be the external URL.
585
589
Otherwise, it would be the service's fqdn or internal IP.
@@ -727,6 +731,10 @@ def _metrics_alert_rules(self) -> Dict:
727
731
query_type = "promql" , topology = JujuTopology .from_charm (self ._charm )
728
732
)
729
733
alert_rules .add_path (self ._metrics_rules , recursive = self ._recursive )
734
+ alert_rules .add (
735
+ generic_alert_groups .application_rules ,
736
+ group_name_prefix = JujuTopology .from_charm (self ._charm ).identifier ,
737
+ )
730
738
return alert_rules .as_dict ()
731
739
732
740
@property
@@ -737,12 +745,27 @@ def _log_alert_rules(self) -> Dict:
737
745
return alert_rules .as_dict ()
738
746
739
747
@property
740
- def _dashboards (self ) -> List [GrafanaDashboard ]:
741
- dashboards : List [GrafanaDashboard ] = []
748
+ def _dashboards (self ) -> List [str ]:
749
+ dashboards : List [str ] = []
742
750
for d in self ._dashboard_dirs :
743
751
for path in Path (d ).glob ("*" ):
744
- dashboard = GrafanaDashboard ._serialize (path .read_bytes ())
745
- dashboards .append (dashboard )
752
+ with open (path , "rt" ) as fp :
753
+ dashboard = json .load (fp )
754
+ rel_path = str (
755
+ path .relative_to (self ._charm .charm_dir ) if path .is_absolute () else path
756
+ )
757
+ # COSAgentProvider is somewhat analogous to GrafanaDashboardProvider. We need to overwrite the uid here
758
+ # because there is currently no other way to communicate the dashboard path separately.
759
+ # https://github.com/canonical/grafana-k8s-operator/pull/363
760
+ dashboard ["uid" ] = DashboardPath40UID .generate (self ._charm .meta .name , rel_path )
761
+
762
+ # Add tags
763
+ tags : List [str ] = dashboard .get ("tags" , [])
764
+ if not any (tag .startswith ("charm: " ) for tag in tags ):
765
+ tags .append (f"charm: { self ._charm .meta .name } " )
766
+ dashboard ["tags" ] = tags
767
+
768
+ dashboards .append (LZMABase64 .compress (json .dumps (dashboard )))
746
769
return dashboards
747
770
748
771
@property
@@ -768,7 +791,7 @@ def is_ready(self, relation: Optional[Relation] = None):
768
791
"""Is this endpoint ready?"""
769
792
relation = relation or self ._relation
770
793
if not relation :
771
- logger .debug (f"no relation on { self ._relation_name !r} : tracing not ready" )
794
+ logger .debug (f"no relation on { self ._relation_name !r} : tracing not ready" )
772
795
return False
773
796
if relation .data is None :
774
797
logger .error (f"relation data is None for { relation } " )
@@ -802,29 +825,48 @@ def get_all_endpoints(
802
825
803
826
def _get_tracing_endpoint (
804
827
self , relation : Optional [Relation ], protocol : ReceiverProtocol
805
- ) -> Optional [str ]:
828
+ ) -> str :
829
+ """Return a tracing endpoint URL if it is available or raise a ProtocolNotFoundError."""
806
830
unit_data = self .get_all_endpoints (relation )
807
831
if not unit_data :
808
- return None
832
+ # we didn't find the protocol because the remote end didn't publish any data yet
833
+ # it might also mean that grafana-agent doesn't have a relation to the tracing backend
834
+ raise ProtocolNotFoundError (protocol )
809
835
receivers : List [Receiver ] = [i for i in unit_data .receivers if i .protocol .name == protocol ]
810
836
if not receivers :
811
- logger .error (f"no receiver found with protocol={ protocol !r} " )
812
- return None
837
+ # we didn't find the protocol because grafana-agent didn't return the protocol that we requested
838
+ # the caller might want to verify that we did indeed request this protocol
839
+ raise ProtocolNotFoundError (protocol )
813
840
if len (receivers ) > 1 :
814
- logger .error (
841
+ logger .warning (
815
842
f"too many receivers with protocol={ protocol !r} ; using first one. Found: { receivers } "
816
843
)
817
- return None
818
844
819
845
receiver = receivers [0 ]
846
+ if not receiver .url :
847
+ # grafana-agent isn't connected to the tracing backend yet
848
+ raise ProtocolNotFoundError (protocol )
820
849
return receiver .url
821
850
822
851
def get_tracing_endpoint (
823
852
self , protocol : ReceiverProtocol , relation : Optional [Relation ] = None
824
- ) -> Optional [str ]:
825
- """Receiver endpoint for the given protocol."""
826
- endpoint = self ._get_tracing_endpoint (relation or self ._relation , protocol = protocol )
827
- if not endpoint :
853
+ ) -> str :
854
+ """Receiver endpoint for the given protocol.
855
+
856
+ It could happen that this function gets called before the provider publishes the endpoints.
857
+ In such a scenario, if a non-leader unit calls this function, a permission denied exception will be raised due to
858
+ restricted access. To prevent this, this function needs to be guarded by the `is_ready` check.
859
+
860
+ Raises:
861
+ ProtocolNotRequestedError:
862
+ If the charm unit is the leader unit and attempts to obtain an endpoint for a protocol it did not request.
863
+ ProtocolNotFoundError:
864
+ If the charm attempts to obtain an endpoint when grafana-agent isn't related to a tracing backend.
865
+ """
866
+ try :
867
+ return self ._get_tracing_endpoint (relation or self ._relation , protocol = protocol )
868
+ except ProtocolNotFoundError :
869
+ # let's see if we didn't find it because we didn't request the endpoint
828
870
requested_protocols = set ()
829
871
relations = [relation ] if relation else self .relations
830
872
for relation in relations :
@@ -839,8 +881,7 @@ def get_tracing_endpoint(
839
881
if protocol not in requested_protocols :
840
882
raise ProtocolNotRequestedError (protocol , relation )
841
883
842
- return None
843
- return endpoint
884
+ raise
844
885
845
886
846
887
class COSAgentDataChanged (EventBase ):
@@ -902,6 +943,8 @@ def __init__(
902
943
events .relation_joined , self ._on_relation_data_changed
903
944
) # TODO: do we need this?
904
945
self .framework .observe (events .relation_changed , self ._on_relation_data_changed )
946
+ self .framework .observe (events .relation_departed , self ._on_relation_departed )
947
+
905
948
for event in self ._refresh_events :
906
949
self .framework .observe (event , self .trigger_refresh ) # pyright: ignore
907
950
@@ -929,6 +972,26 @@ def _on_peer_relation_changed(self, _):
929
972
if self ._charm .unit .is_leader ():
930
973
self .on .data_changed .emit () # pyright: ignore
931
974
975
+ def _on_relation_departed (self , event ):
976
+ """Remove provider's (principal's) alert rules and dashboards from peer data when the cos-agent relation to the principal is removed."""
977
+ if not self .peer_relation :
978
+ event .defer ()
979
+ return
980
+ # empty the departing unit's alert rules and dashboards from peer data
981
+ data = CosAgentPeersUnitData (
982
+ unit_name = event .unit .name ,
983
+ relation_id = str (event .relation .id ),
984
+ relation_name = event .relation .name ,
985
+ metrics_alert_rules = {},
986
+ log_alert_rules = {},
987
+ dashboards = [],
988
+ )
989
+ self .peer_relation .data [self ._charm .unit ][
990
+ f"{ CosAgentPeersUnitData .KEY } -{ event .unit .name } "
991
+ ] = data .json ()
992
+
993
+ self .on .data_changed .emit () # pyright: ignore
994
+
932
995
def _on_relation_data_changed (self , event : RelationChangedEvent ):
933
996
# Peer data is the only means of communication between subordinate units.
934
997
if not self .peer_relation :
@@ -988,7 +1051,16 @@ def update_tracing_receivers(self):
988
1051
CosAgentRequirerUnitData (
989
1052
receivers = [
990
1053
Receiver (
991
- url = f"{ self ._get_tracing_receiver_url (protocol )} " ,
1054
+ # if tracing isn't ready, we don't want the wrong receiver URLs present in the databag.
1055
+ # however, because of the backwards compatibility requirements, we need to still provide
1056
+ # the protocols list so that the charm with older cos_agent version doesn't error its hooks.
1057
+ # before this change was added, the charm with old cos_agent version threw exceptions with
1058
+ # connections to grafana-agent timing out. After the change, the charm will fail validating
1059
+ # databag contents (as it expects a string in URL) but that won't cause any errors as
1060
+ # tracing endpoints are the only content in the grafana-agent's side of the databag.
1061
+ url = f"{ self ._get_tracing_receiver_url (protocol )} "
1062
+ if self ._charm .tracing .is_ready () # type: ignore
1063
+ else None ,
992
1064
protocol = ProtocolType (
993
1065
name = protocol ,
994
1066
type = receiver_protocol_to_transport_protocol [protocol ],
@@ -1030,8 +1102,7 @@ def _get_requested_protocols(self, relation: Relation):
1030
1102
if len (units ) > 1 :
1031
1103
# should never happen
1032
1104
raise ValueError (
1033
- f"unexpected error: subordinate relation { relation } "
1034
- f"should have exactly one unit"
1105
+ f"unexpected error: subordinate relation { relation } should have exactly one unit"
1035
1106
)
1036
1107
1037
1108
unit = next (iter (units ), None )
@@ -1287,7 +1358,7 @@ def dashboards(self) -> List[Dict[str, str]]:
1287
1358
seen_apps .append (app_name )
1288
1359
1289
1360
for encoded_dashboard in data .dashboards or ():
1290
- content = GrafanaDashboard ( encoded_dashboard ). _deserialize ( )
1361
+ content = json . loads ( LZMABase64 . decompress ( encoded_dashboard ) )
1291
1362
1292
1363
title = content .get ("title" , "no_title" )
1293
1364
@@ -1314,44 +1385,32 @@ def charm_tracing_config(
1314
1385
If https endpoint is provided but cert_path is not found on disk:
1315
1386
disable charm tracing.
1316
1387
If https endpoint is provided and cert_path is None:
1317
- ERROR
1388
+ raise TracingError
1318
1389
Else:
1319
1390
proceed with charm tracing (with or without tls, as appropriate)
1320
1391
1321
1392
Usage:
1322
- If you are using charm_tracing >= v1.9:
1323
- >>> from lib.charms.tempo_k8s.v1.charm_tracing import trace_charm
1324
- >>> from lib.charms.tempo_k8s.v0.cos_agent import charm_tracing_config
1393
+ >>> from lib.charms.tempo_coordinator_k8s.v0.charm_tracing import trace_charm
1394
+ >>> from lib.charms.tempo_coordinator_k8s.v0.tracing import charm_tracing_config
1325
1395
>>> @trace_charm(tracing_endpoint="my_endpoint", cert_path="cert_path")
1326
1396
>>> class MyCharm(...):
1327
1397
>>> _cert_path = "/path/to/cert/on/charm/container.crt"
1328
1398
>>> def __init__(self, ...):
1329
- >>> self.cos_agent = COSAgentProvider (...)
1399
+ >>> self.tracing = TracingEndpointRequirer (...)
1330
1400
>>> self.my_endpoint, self.cert_path = charm_tracing_config(
1331
- ... self.cos_agent, self._cert_path)
1332
-
1333
- If you are using charm_tracing < v1.9:
1334
- >>> from lib.charms.tempo_k8s.v1.charm_tracing import trace_charm
1335
- >>> from lib.charms.tempo_k8s.v2.tracing import charm_tracing_config
1336
- >>> @trace_charm(tracing_endpoint="my_endpoint", cert_path="cert_path")
1337
- >>> class MyCharm(...):
1338
- >>> _cert_path = "/path/to/cert/on/charm/container.crt"
1339
- >>> def __init__(self, ...):
1340
- >>> self.cos_agent = COSAgentProvider(...)
1341
- >>> self.my_endpoint, self.cert_path = charm_tracing_config(
1342
- ... self.cos_agent, self._cert_path)
1343
- >>> @property
1344
- >>> def my_endpoint(self):
1345
- >>> return self._my_endpoint
1346
- >>> @property
1347
- >>> def cert_path(self):
1348
- >>> return self._cert_path
1349
-
1401
+ ... self.tracing, self._cert_path)
1350
1402
"""
1351
1403
if not endpoint_requirer .is_ready ():
1352
1404
return None , None
1353
1405
1354
- endpoint = endpoint_requirer .get_tracing_endpoint ("otlp_http" )
1406
+ try :
1407
+ endpoint = endpoint_requirer .get_tracing_endpoint ("otlp_http" )
1408
+ except ProtocolNotFoundError :
1409
+ logger .warn (
1410
+ "Endpoint for tracing wasn't provided as tracing backend isn't ready yet. If grafana-agent isn't connected to a tracing backend, integrate it. Otherwise this issue should resolve itself in a few events."
1411
+ )
1412
+ return None , None
1413
+
1355
1414
if not endpoint :
1356
1415
return None , None
1357
1416
0 commit comments