8
8
- `COSAgentProvider`: Use in machine charms that need to have a workload's metrics
9
9
or logs scraped, or forward rule files or dashboards to Prometheus, Loki or Grafana through
10
10
the Grafana Agent machine charm.
11
+ NOTE: Be sure to add `limit: 1` in your charm for the cos-agent relation. That is the only
12
+ way we currently have to prevent two different grafana agent apps from being deployed on the same VM.
11
13
12
14
- `COSAgentConsumer`: Used in the Grafana Agent machine charm to manage the requirer side of
13
15
the `cos_agent` interface.
22
24
Using the `COSAgentProvider` object only requires instantiating it,
23
25
typically in the `__init__` method of your charm (the one which sends telemetry).
24
26
25
- The constructor of `COSAgentProvider` has only one required and ten optional parameters:
26
27
27
28
```python
28
29
def __init__(
@@ -233,8 +234,8 @@ def __init__(self, *args):
233
234
)
234
235
235
236
import pydantic
236
- from cosl import GrafanaDashboard , JujuTopology
237
- from cosl .rules import AlertRules
237
+ from cosl import DashboardPath40UID , JujuTopology , LZMABase64
238
+ from cosl .rules import AlertRules , generic_alert_groups
238
239
from ops .charm import RelationChangedEvent
239
240
from ops .framework import EventBase , EventSource , Object , ObjectEvents
240
241
from ops .model import ModelError , Relation
@@ -253,9 +254,9 @@ class _MetricsEndpointDict(TypedDict):
253
254
254
255
LIBID = "dc15fa84cef84ce58155fb84f6c6213a"
255
256
LIBAPI = 0
256
- LIBPATCH = 11
257
+ LIBPATCH = 19
257
258
258
- PYDEPS = ["cosl" , "pydantic" ]
259
+ PYDEPS = ["cosl >= 0.0.50 " , "pydantic" ]
259
260
260
261
DEFAULT_RELATION_NAME = "cos-agent"
261
262
DEFAULT_PEER_RELATION_NAME = "peers"
@@ -267,7 +268,6 @@ class _MetricsEndpointDict(TypedDict):
267
268
logger = logging .getLogger (__name__ )
268
269
SnapEndpoint = namedtuple ("SnapEndpoint" , "owner, name" )
269
270
270
-
271
271
# Note: MutableMapping is imported from the typing module and not collections.abc
272
272
# because subscripting collections.abc.MutableMapping was added in python 3.9, but
273
273
# most of our charms are based on 20.04, which has python 3.8.
@@ -317,7 +317,11 @@ class NotReadyError(TracingError):
317
317
"""Raised by the provider wrapper if a requirer hasn't published the required data (yet)."""
318
318
319
319
320
- class ProtocolNotRequestedError (TracingError ):
320
+ class ProtocolNotFoundError (TracingError ):
321
+ """Raised if the user doesn't receive an endpoint for a protocol it requested."""
322
+
323
+
324
+ class ProtocolNotRequestedError (ProtocolNotFoundError ):
321
325
"""Raised if the user attempts to obtain an endpoint for a protocol it did not request."""
322
326
323
327
@@ -476,7 +480,7 @@ class CosAgentProviderUnitData(DatabagModel):
476
480
# this needs to make its way to the gagent leader
477
481
metrics_alert_rules : dict
478
482
log_alert_rules : dict
479
- dashboards : List [GrafanaDashboard ]
483
+ dashboards : List [str ]
480
484
# subordinate is no longer used but we should keep it until we bump the library to ensure
481
485
# we don't break compatibility.
482
486
subordinate : Optional [bool ] = None
@@ -509,7 +513,7 @@ class CosAgentPeersUnitData(DatabagModel):
509
513
# of the outgoing o11y relations.
510
514
metrics_alert_rules : Optional [dict ]
511
515
log_alert_rules : Optional [dict ]
512
- dashboards : Optional [List [GrafanaDashboard ]]
516
+ dashboards : Optional [List [str ]]
513
517
514
518
# when this whole datastructure is dumped into a databag, it will be nested under this key.
515
519
# while not strictly necessary (we could have it 'flattened out' into the databag),
@@ -579,7 +583,7 @@ class Receiver(pydantic.BaseModel):
579
583
"""Specification of an active receiver."""
580
584
581
585
protocol : ProtocolType = pydantic .Field (..., description = "Receiver protocol name and type." )
582
- url : str = pydantic .Field (
586
+ url : Optional [ str ] = pydantic .Field (
583
587
...,
584
588
description = """URL at which the receiver is reachable. If there's an ingress, it would be the external URL.
585
589
Otherwise, it would be the service's fqdn or internal IP.
@@ -727,6 +731,10 @@ def _metrics_alert_rules(self) -> Dict:
727
731
query_type = "promql" , topology = JujuTopology .from_charm (self ._charm )
728
732
)
729
733
alert_rules .add_path (self ._metrics_rules , recursive = self ._recursive )
734
+ alert_rules .add (
735
+ generic_alert_groups .application_rules ,
736
+ group_name_prefix = JujuTopology .from_charm (self ._charm ).identifier ,
737
+ )
730
738
return alert_rules .as_dict ()
731
739
732
740
@property
@@ -737,12 +745,20 @@ def _log_alert_rules(self) -> Dict:
737
745
return alert_rules .as_dict ()
738
746
739
747
@property
740
- def _dashboards (self ) -> List [GrafanaDashboard ]:
741
- dashboards : List [GrafanaDashboard ] = []
748
+ def _dashboards (self ) -> List [str ]:
749
+ dashboards : List [str ] = []
742
750
for d in self ._dashboard_dirs :
743
751
for path in Path (d ).glob ("*" ):
744
- dashboard = GrafanaDashboard ._serialize (path .read_bytes ())
745
- dashboards .append (dashboard )
752
+ with open (path , "rt" ) as fp :
753
+ dashboard = json .load (fp )
754
+ rel_path = str (
755
+ path .relative_to (self ._charm .charm_dir ) if path .is_absolute () else path
756
+ )
757
+ # COSAgentProvider is somewhat analogous to GrafanaDashboardProvider. We need to overwrite the uid here
758
+ # because there is currently no other way to communicate the dashboard path separately.
759
+ # https://github.com/canonical/grafana-k8s-operator/pull/363
760
+ dashboard ["uid" ] = DashboardPath40UID .generate (self ._charm .meta .name , rel_path )
761
+ dashboards .append (LZMABase64 .compress (json .dumps (dashboard )))
746
762
return dashboards
747
763
748
764
@property
@@ -768,7 +784,7 @@ def is_ready(self, relation: Optional[Relation] = None):
768
784
"""Is this endpoint ready?"""
769
785
relation = relation or self ._relation
770
786
if not relation :
771
- logger .debug (f"no relation on { self ._relation_name !r} : tracing not ready" )
787
+ logger .debug (f"no relation on { self ._relation_name !r} : tracing not ready" )
772
788
return False
773
789
if relation .data is None :
774
790
logger .error (f"relation data is None for { relation } " )
@@ -802,29 +818,48 @@ def get_all_endpoints(
802
818
803
819
def _get_tracing_endpoint (
804
820
self , relation : Optional [Relation ], protocol : ReceiverProtocol
805
- ) -> Optional [str ]:
821
+ ) -> str :
822
+ """Return a tracing endpoint URL if it is available or raise a ProtocolNotFoundError."""
806
823
unit_data = self .get_all_endpoints (relation )
807
824
if not unit_data :
808
- return None
825
+ # we didn't find the protocol because the remote end didn't publish any data yet
826
+ # it might also mean that grafana-agent doesn't have a relation to the tracing backend
827
+ raise ProtocolNotFoundError (protocol )
809
828
receivers : List [Receiver ] = [i for i in unit_data .receivers if i .protocol .name == protocol ]
810
829
if not receivers :
811
- logger .error (f"no receiver found with protocol={ protocol !r} " )
812
- return None
830
+ # we didn't find the protocol because grafana-agent didn't return the protocol that we requested
831
+ # the caller might want to verify that we did indeed request this protocol
832
+ raise ProtocolNotFoundError (protocol )
813
833
if len (receivers ) > 1 :
814
- logger .error (
834
+ logger .warning (
815
835
f"too many receivers with protocol={ protocol !r} ; using first one. Found: { receivers } "
816
836
)
817
- return None
818
837
819
838
receiver = receivers [0 ]
839
+ if not receiver .url :
840
+ # grafana-agent isn't connected to the tracing backend yet
841
+ raise ProtocolNotFoundError (protocol )
820
842
return receiver .url
821
843
822
844
def get_tracing_endpoint (
823
845
self , protocol : ReceiverProtocol , relation : Optional [Relation ] = None
824
- ) -> Optional [str ]:
825
- """Receiver endpoint for the given protocol."""
826
- endpoint = self ._get_tracing_endpoint (relation or self ._relation , protocol = protocol )
827
- if not endpoint :
846
+ ) -> str :
847
+ """Receiver endpoint for the given protocol.
848
+
849
+ This function may be called before the provider publishes the endpoints.
850
+ In such a scenario, if a non-leader unit calls this function, a permission-denied exception will be raised due to
851
+ restricted access. To prevent this, guard calls to this function with the `is_ready` check.
852
+
853
+ Raises:
854
+ ProtocolNotRequestedError:
855
+ If the charm unit is the leader unit and attempts to obtain an endpoint for a protocol it did not request.
856
+ ProtocolNotFoundError:
857
+ If the charm attempts to obtain an endpoint when grafana-agent isn't related to a tracing backend.
858
+ """
859
+ try :
860
+ return self ._get_tracing_endpoint (relation or self ._relation , protocol = protocol )
861
+ except ProtocolNotFoundError :
862
+ # let's see if we didn't find it because we didn't request the endpoint
828
863
requested_protocols = set ()
829
864
relations = [relation ] if relation else self .relations
830
865
for relation in relations :
@@ -839,8 +874,7 @@ def get_tracing_endpoint(
839
874
if protocol not in requested_protocols :
840
875
raise ProtocolNotRequestedError (protocol , relation )
841
876
842
- return None
843
- return endpoint
877
+ raise
844
878
845
879
846
880
class COSAgentDataChanged (EventBase ):
@@ -902,6 +936,8 @@ def __init__(
902
936
events .relation_joined , self ._on_relation_data_changed
903
937
) # TODO: do we need this?
904
938
self .framework .observe (events .relation_changed , self ._on_relation_data_changed )
939
+ self .framework .observe (events .relation_departed , self ._on_relation_departed )
940
+
905
941
for event in self ._refresh_events :
906
942
self .framework .observe (event , self .trigger_refresh ) # pyright: ignore
907
943
@@ -929,6 +965,26 @@ def _on_peer_relation_changed(self, _):
929
965
if self ._charm .unit .is_leader ():
930
966
self .on .data_changed .emit () # pyright: ignore
931
967
968
+ def _on_relation_departed (self , event ):
969
+ """Remove provider's (principal's) alert rules and dashboards from peer data when the cos-agent relation to the principal is removed."""
970
+ if not self .peer_relation :
971
+ event .defer ()
972
+ return
973
+ # empty the departing unit's alert rules and dashboards from peer data
974
+ data = CosAgentPeersUnitData (
975
+ unit_name = event .unit .name ,
976
+ relation_id = str (event .relation .id ),
977
+ relation_name = event .relation .name ,
978
+ metrics_alert_rules = {},
979
+ log_alert_rules = {},
980
+ dashboards = [],
981
+ )
982
+ self .peer_relation .data [self ._charm .unit ][
983
+ f"{ CosAgentPeersUnitData .KEY } -{ event .unit .name } "
984
+ ] = data .json ()
985
+
986
+ self .on .data_changed .emit () # pyright: ignore
987
+
932
988
def _on_relation_data_changed (self , event : RelationChangedEvent ):
933
989
# Peer data is the only means of communication between subordinate units.
934
990
if not self .peer_relation :
@@ -988,7 +1044,16 @@ def update_tracing_receivers(self):
988
1044
CosAgentRequirerUnitData (
989
1045
receivers = [
990
1046
Receiver (
991
- url = f"{ self ._get_tracing_receiver_url (protocol )} " ,
1047
+ # if tracing isn't ready, we don't want the wrong receiver URLs present in the databag.
1048
+ # however, because of the backwards compatibility requirements, we need to still provide
1049
+ # the protocols list so that the charm with older cos_agent version doesn't error its hooks.
1050
+ # before this change was added, the charm with old cos_agent version threw exceptions with
1051
+ # connections to grafana-agent timing out. After the change, the charm will fail validating
1052
+ # databag contents (as it expects a string in URL) but that won't cause any errors as
1053
+ # tracing endpoints are the only content in the grafana-agent's side of the databag.
1054
+ url = f"{ self ._get_tracing_receiver_url (protocol )} "
1055
+ if self ._charm .tracing .is_ready () # type: ignore
1056
+ else None ,
992
1057
protocol = ProtocolType (
993
1058
name = protocol ,
994
1059
type = receiver_protocol_to_transport_protocol [protocol ],
@@ -1030,8 +1095,7 @@ def _get_requested_protocols(self, relation: Relation):
1030
1095
if len (units ) > 1 :
1031
1096
# should never happen
1032
1097
raise ValueError (
1033
- f"unexpected error: subordinate relation { relation } "
1034
- f"should have exactly one unit"
1098
+ f"unexpected error: subordinate relation { relation } should have exactly one unit"
1035
1099
)
1036
1100
1037
1101
unit = next (iter (units ), None )
@@ -1287,7 +1351,7 @@ def dashboards(self) -> List[Dict[str, str]]:
1287
1351
seen_apps .append (app_name )
1288
1352
1289
1353
for encoded_dashboard in data .dashboards or ():
1290
- content = GrafanaDashboard ( encoded_dashboard ). _deserialize ( )
1354
+ content = json . loads ( LZMABase64 . decompress ( encoded_dashboard ) )
1291
1355
1292
1356
title = content .get ("title" , "no_title" )
1293
1357
@@ -1314,44 +1378,32 @@ def charm_tracing_config(
1314
1378
If https endpoint is provided but cert_path is not found on disk:
1315
1379
disable charm tracing.
1316
1380
If https endpoint is provided and cert_path is None:
1317
- ERROR
1381
+ raise TracingError
1318
1382
Else:
1319
1383
proceed with charm tracing (with or without tls, as appropriate)
1320
1384
1321
1385
Usage:
1322
- If you are using charm_tracing >= v1.9:
1323
- >>> from lib.charms.tempo_k8s.v1.charm_tracing import trace_charm
1324
- >>> from lib.charms.tempo_k8s.v0.cos_agent import charm_tracing_config
1386
+ >>> from lib.charms.tempo_coordinator_k8s.v0.charm_tracing import trace_charm
1387
+ >>> from lib.charms.tempo_coordinator_k8s.v0.tracing import charm_tracing_config
1325
1388
>>> @trace_charm(tracing_endpoint="my_endpoint", cert_path="cert_path")
1326
1389
>>> class MyCharm(...):
1327
1390
>>> _cert_path = "/path/to/cert/on/charm/container.crt"
1328
1391
>>> def __init__(self, ...):
1329
- >>> self.cos_agent = COSAgentProvider (...)
1392
+ >>> self.tracing = TracingEndpointRequirer (...)
1330
1393
>>> self.my_endpoint, self.cert_path = charm_tracing_config(
1331
- ... self.cos_agent, self._cert_path)
1332
-
1333
- If you are using charm_tracing < v1.9:
1334
- >>> from lib.charms.tempo_k8s.v1.charm_tracing import trace_charm
1335
- >>> from lib.charms.tempo_k8s.v2.tracing import charm_tracing_config
1336
- >>> @trace_charm(tracing_endpoint="my_endpoint", cert_path="cert_path")
1337
- >>> class MyCharm(...):
1338
- >>> _cert_path = "/path/to/cert/on/charm/container.crt"
1339
- >>> def __init__(self, ...):
1340
- >>> self.cos_agent = COSAgentProvider(...)
1341
- >>> self.my_endpoint, self.cert_path = charm_tracing_config(
1342
- ... self.cos_agent, self._cert_path)
1343
- >>> @property
1344
- >>> def my_endpoint(self):
1345
- >>> return self._my_endpoint
1346
- >>> @property
1347
- >>> def cert_path(self):
1348
- >>> return self._cert_path
1349
-
1394
+ ... self.tracing, self._cert_path)
1350
1395
"""
1351
1396
if not endpoint_requirer .is_ready ():
1352
1397
return None , None
1353
1398
1354
- endpoint = endpoint_requirer .get_tracing_endpoint ("otlp_http" )
1399
+ try :
1400
+ endpoint = endpoint_requirer .get_tracing_endpoint ("otlp_http" )
1401
+ except ProtocolNotFoundError :
1402
+ logger .warn (
1403
+ "Endpoint for tracing wasn't provided as tracing backend isn't ready yet. If grafana-agent isn't connected to a tracing backend, integrate it. Otherwise this issue should resolve itself in a few events."
1404
+ )
1405
+ return None , None
1406
+
1355
1407
if not endpoint :
1356
1408
return None , None
1357
1409
0 commit comments