Commit f6021bd
mgr/dashboard: Migrate from promtail to grafana alloy
Since promtail is now deprecated, we need to start using grafana alloy for the centralized logging setup.

Fixes: https://tracker.ceph.com/issues/71072
Signed-off-by: Aashish Sharma <[email protected]>
1 parent 3f51eeb commit f6021bd

File tree: 17 files changed (+194, -23 lines)
doc/cephadm/services/monitoring.rst

Lines changed: 5 additions & 6 deletions

@@ -119,7 +119,7 @@ retrieve the current credentials.
 Centralized Logging in Ceph
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~

-Ceph now provides centralized logging with Loki & Promtail. Centralized Log Management (CLM) consolidates all log data and pushes it to a central repository,
+Ceph now provides centralized logging with Loki and Alloy. Centralized Log Management (CLM) consolidates all log data and pushes it to a central repository,
 with an accessible and easy-to-use interface. Centralized logging is designed to make your life easier.
 Some of the advantages are:

@@ -128,13 +128,12 @@ Some of the advantages are:
 #. **Flexible retention policies**: with per-daemon logs, log rotation is usually set to a short interval (1-2 weeks) to save disk usage.
 #. **Increased security & backup**: logs can contain sensitive information and expose usage patterns. Additionally, centralized logging allows for HA, etc.

-Centralized Logging in Ceph is implemented using two new services - ``loki`` & ``promtail``.
+Centralized Logging in Ceph is implemented using two services: ``loki`` and ``alloy``.

-Loki: It is basically a log aggregation system and is used to query logs. It can be configured as a datasource in Grafana.
+* Loki is a log aggregation system and is used to query logs. It can be configured as a ``datasource`` in Grafana.
+* Alloy acts as an agent that gathers logs from each node and forwards them to Loki.

-Promtail: It acts as an agent that gathers logs from the system and makes them available to Loki.
-
-These two services are not deployed by default in a Ceph cluster. To enable the centralized logging you can follow the steps mentioned here :ref:`centralized-logging`.
+These two services are not deployed by default in a Ceph cluster. To enable centralized logging you can follow the steps mentioned here :ref:`centralized-logging`.

 .. _cephadm-monitoring-networks-ports:
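In practice, enabling the updated stack works the same way as it did with Promtail; a minimal sketch using the bootstrap flag and the orch apply calls changed below in src/cephadm/cephadm.py (the <mon-ip> value is a placeholder):

    cephadm bootstrap --mon-ip <mon-ip> --with-centralized-logging   # provisions loki and alloy
    # or, on an already bootstrapped cluster:
    ceph orch apply loki
    ceph orch apply alloy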

src/cephadm/cephadm.py

Lines changed: 5 additions & 2 deletions

@@ -655,6 +655,9 @@ def create_daemon_dirs(
     elif daemon_type == 'promtail':
         data_dir_root = ident.data_dir(ctx.data_dir)
         config_dir = 'etc/promtail'
+    elif daemon_type == 'alloy':
+        data_dir_root = ident.data_dir(ctx.data_dir)
+        config_dir = 'etc/alloy'
         makedirs(os.path.join(data_dir_root, config_dir), uid, gid, 0o755)
         makedirs(os.path.join(data_dir_root, 'data'), uid, gid, 0o755)
     elif daemon_type == 'loki':
@@ -2250,7 +2253,7 @@ def prepare_ssh(
                  'Perhaps the ceph version being bootstrapped does not support it')

     if ctx.with_centralized_logging:
-        for t in ['loki', 'promtail']:
+        for t in ['loki', 'alloy']:
             logger.info('Deploying %s service with default placement...' % t)
             try:
                 cli(['orch', 'apply', t])
@@ -5009,7 +5012,7 @@ def _get_parser():
     parser_bootstrap.add_argument(
         '--with-centralized-logging',
         action='store_true',
-        help='Automatically provision centralized logging (promtail, loki)')
+        help='Automatically provision centralized logging (alloy, loki)')
     parser_bootstrap.add_argument(
         '--apply-spec',
         help='Apply cluster spec after bootstrap (copy ssh key, add hosts and apply services)')

src/cephadm/cephadmlib/daemons/monitoring.py

Lines changed: 23 additions & 2 deletions

@@ -38,6 +38,7 @@ class Monitoring(ContainerDaemonForm):
         'alertmanager': [9093, 9094],
         'loki': [3100],
         'promtail': [9080],
+        'alloy': [9080],
     }

     components = {
@@ -73,6 +74,17 @@ class Monitoring(ContainerDaemonForm):
                 'promtail.yml',
             ],
         },
+        'alloy': {
+            'image': DefaultImages.ALLOY.image_ref,
+            'cpus': '1',
+            'memory': '1GB',
+            'args': [
+                'run',
+                '/etc/alloy/config.alloy',
+                '--storage.path=/var/lib/alloy/data',
+            ],
+            'config-json-files': ['config.alloy'],
+        },
         'node-exporter': {
             'image': DefaultImages.NODE_EXPORTER.image_ref,
             'cpus': '1',
@@ -112,14 +124,15 @@ def for_daemon_type(cls, daemon_type: str) -> bool:
     def get_version(ctx, container_id, daemon_type):
         # type: (CephadmContext, str, str) -> str
         """
-        :param: daemon_type Either "prometheus", "alertmanager", "loki", "promtail" or "node-exporter"
+        :param: daemon_type Either "prometheus", "alertmanager", "loki", "alloy" or "node-exporter"
         """
         assert daemon_type in (
             'prometheus',
             'alertmanager',
             'node-exporter',
             'loki',
             'promtail',
+            'alloy',
         )
         cmd = daemon_type.replace('-', '_')
         code = -1
@@ -175,6 +188,8 @@ def extract_uid_gid(
             uid, gid = extract_uid_gid(ctx, file_path='/etc/loki')
         elif daemon_type == 'promtail':
             uid, gid = extract_uid_gid(ctx, file_path='/etc/promtail')
+        elif daemon_type == 'alloy':
+            uid, gid = extract_uid_gid(ctx, file_path='/etc/alloy')
         elif daemon_type == 'alertmanager':
             uid, gid = extract_uid_gid(
                 ctx, file_path=['/etc/alertmanager', '/etc/prometheus']
@@ -240,7 +255,7 @@ def get_daemon_args(self) -> List[str]:
         metadata = self.components[daemon_type]
         r = list(metadata.get('args', []))
         # set ip and port to bind to for nodeexporter,alertmanager,prometheus
-        if daemon_type not in ['grafana', 'loki', 'promtail']:
+        if daemon_type not in ['grafana', 'loki', 'promtail', 'alloy']:
             ip = ''
             port = self.port_map[daemon_type][0]
             meta = fetch_meta(ctx)
@@ -333,6 +348,10 @@ def _get_container_mounts(self, data_dir: str) -> Dict[str, str]:
             mounts[os.path.join(data_dir, 'etc/promtail')] = '/etc/promtail:Z'
             mounts[log_dir] = '/var/log/ceph:z'
             mounts[os.path.join(data_dir, 'data')] = '/promtail:Z'
+        elif daemon_type == 'alloy':
+            mounts[os.path.join(data_dir, 'etc/alloy')] = '/etc/alloy:Z'
+            mounts[log_dir] = '/var/log/ceph:z'
+            mounts[os.path.join(data_dir, 'data')] = '/var/lib/alloy/data:Z'
         elif daemon_type == 'node-exporter':
             mounts[
                 os.path.join(data_dir, 'etc/node-exporter')
@@ -379,6 +398,8 @@ def customize_container_args(
             # by ubuntu 18.04 kernel!)
         ]
         args.extend(monitoring_args)
+        if self.identity.daemon_type == 'alloy':
+            args.extend(['--user=root'])
         if self.identity.daemon_type == 'node-exporter':
             # in order to support setting '--path.procfs=/host/proc','--path.sysfs=/host/sys',
             # '--path.rootfs=/rootfs' for node-exporter we need to disable selinux separation

src/cephadm/cephadmlib/listing_updaters.py

Lines changed: 1 addition & 0 deletions

@@ -240,6 +240,7 @@ def update(
             'node-exporter',
             'loki',
             'promtail',
+            'alloy',
         ]:
             version = Monitoring.get_version(
                 ctx, container_id, daemon_type

src/pybind/mgr/cephadm/migrations.py

Lines changed: 71 additions & 2 deletions

@@ -16,7 +16,7 @@
 if TYPE_CHECKING:
     from .module import CephadmOrchestrator

-LAST_MIGRATION = 8
+LAST_MIGRATION = 9

 logger = logging.getLogger(__name__)

@@ -86,7 +86,6 @@ def verify_no_migration(self) -> None:
                 "cephadm migration still ongoing. Please wait, until the migration is complete.")

     def migrate(self, startup: bool = False) -> None:
-
         logger.info('running migrations')

         if self.mgr.migration_current == 0:
@@ -121,6 +120,10 @@ def migrate(self, startup: bool = False) -> None:
             if self.migrate_7_8():
                 self.set(8)

+        if self.mgr.migration_current == 8:
+            if self.migrate_8_9():
+                self.set(9)
+
     def migrate_0_1(self) -> bool:
         """
         Migration 0 -> 1
@@ -498,6 +501,72 @@ def migrate_7_8(self) -> bool:
         self.rgw_ssl_migration_queue = []
         return True

+    def migrate_8_9(self) -> bool:
+        """
+        Replace Promtail with Alloy.
+
+        - If mgr daemons are still being upgraded, return True WITHOUT bumping migration_current.
+        - Mark Promtail service unmanaged so cephadm won't redeploy it.
+        - Remove Promtail daemons to free ports.
+        - Deploy Alloy with Promtail's placement.
+        - Once Alloy is confirmed deployed, remove Promtail service spec.
+        """
+        try:
+            target_digests = getattr(self.mgr.upgrade.upgrade_state, "target_digests", [])
+            active_mgr_digests = self.mgr.get_active_mgr_digests()
+
+            if target_digests:
+                if not any(d in target_digests for d in active_mgr_digests):
+                    logger.info(
+                        "Promtail -> Alloy migration: mgr daemons still upgrading. "
+                        "Marking as complete without bumping migration_current."
+                    )
+                    return False
+
+            promtail_spec = self.mgr.spec_store.active_specs.get("promtail")
+            if not promtail_spec:
+                logger.info("Promtail -> Alloy migration: no Promtail \
+                            service found, nothing to do.")
+                return True
+
+            if not promtail_spec.unmanaged:
+                logger.info("Promtail -> Alloy migration: marking promtail unmanaged")
+                self.mgr.spec_store.set_unmanaged("promtail", True)
+
+            daemons = self.mgr.cache.get_daemons()
+            promtail_daemons = [d for d in daemons if d.daemon_type == "promtail"]
+            if promtail_daemons:
+                promtail_names = [d.name() for d in promtail_daemons]
+                logger.info(f"Promtail -> Alloy migration: removing daemons {promtail_names}")
+                self.mgr.remove_daemons(promtail_names)
+
+            daemons = self.mgr.cache.get_daemons()
+            if any(d.daemon_type == "promtail" for d in daemons):
+                logger.info(
+                    "Promtail -> Alloy migration: promtail daemons still present, "
+                    "skipping Alloy deployment until next run."
+                )
+                return False
+
+            alloy_spec = ServiceSpec(
+                service_type="alloy",
+                service_id="alloy",
+                placement=promtail_spec.placement
+            )
+
+            logger.info("Promtail -> Alloy migration: deploying Alloy service")
+            self.mgr.apply_alloy(alloy_spec)
+
+            logger.info("Promtail -> Alloy migration: removing promtail service spec")
+            self.mgr.remove_service("promtail")
+
+            logger.info("Promtail -> Alloy migration completed successfully.")
+            return True
+
+        except Exception as e:
+            logger.error(f"Promtail -> Alloy migration failed: {e}")
+            return False
+

 def queue_migrate_rgw_spec(mgr: "CephadmOrchestrator", spec_dict: Dict[Any, Any]) -> None:
     """

src/pybind/mgr/cephadm/module.py

Lines changed: 8 additions & 1 deletion

@@ -537,6 +537,7 @@ def __init__(self, *args: Any, **kwargs: Any):
         self.container_image_node_exporter = ''
         self.container_image_loki = ''
         self.container_image_promtail = ''
+        self.container_image_alloy = ''
         self.container_image_haproxy = ''
         self.container_image_keepalived = ''
         self.container_image_snmp_gateway = ''
@@ -931,7 +932,7 @@ def get_unique_name(
         suffix = daemon_type not in [
             'mon', 'crash', 'ceph-exporter', 'node-proxy',
             'prometheus', 'node-exporter', 'grafana', 'alertmanager',
-            'container', 'agent', 'snmp-gateway', 'loki', 'promtail',
+            'container', 'agent', 'snmp-gateway', 'loki', 'promtail', 'alloy',
             'elasticsearch', 'jaeger-collector', 'jaeger-agent', 'jaeger-query', 'mgmt-gateway', 'oauth2-proxy'
         ]
         if forcename:
@@ -1747,6 +1748,7 @@ def get_container_image(
             'nvmeof': self.container_image_nvmeof,
             'prometheus': self.container_image_prometheus,
             'promtail': self.container_image_promtail,
+            'alloy': self.container_image_alloy,
             'snmp-gateway': self.container_image_snmp_gateway,
             'mgmt-gateway': self.container_image_nginx,
             'oauth2-proxy': self.container_image_oauth2_proxy,
@@ -3756,6 +3758,7 @@ def _apply_service_spec(self, spec: ServiceSpec) -> str:
             'ceph-exporter': PlacementSpec(host_pattern='*'),
             'loki': PlacementSpec(count=1),
             'promtail': PlacementSpec(host_pattern='*'),
+            'alloy': PlacementSpec(host_pattern='*'),
             'crash': PlacementSpec(host_pattern='*'),
             'container': PlacementSpec(count=1),
             'snmp-gateway': PlacementSpec(count=1),
@@ -3901,6 +3904,10 @@ def apply_loki(self, spec: ServiceSpec) -> str:
     def apply_promtail(self, spec: ServiceSpec) -> str:
         return self._apply(spec)

+    @handle_orch_error
+    def apply_alloy(self, spec: ServiceSpec) -> str:
+        return self._apply(spec)
+
     @handle_orch_error
     def apply_node_exporter(self, spec: ServiceSpec) -> str:
         return self._apply(spec)
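The default placement above deploys one alloy daemon per host (host_pattern '*'), mirroring the old promtail default. If a narrower placement is wanted, a minimal spec sketch (the label name is a placeholder) could be applied with ceph orch apply -i alloy.yaml:

    service_type: alloy
    placement:
      label: logging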

src/pybind/mgr/cephadm/services/monitoring.py

Lines changed: 37 additions & 0 deletions

@@ -826,6 +826,43 @@ def generate_config(self, daemon_spec: CephadmDaemonDeploySpec) -> Tuple[Dict[st
         }, sorted(deps)


+@register_cephadm_service
+class AlloyService(CephadmService):
+    TYPE = 'alloy'
+    DEFAULT_SERVICE_PORT = 9080
+
+    @classmethod
+    def get_dependencies(cls, mgr: "CephadmOrchestrator",
+                         spec: Optional[ServiceSpec] = None,
+                         daemon_type: Optional[str] = None) -> List[str]:
+        return sorted(mgr.cache.get_daemons_by_types(['loki']))
+
+    def prepare_create(self, daemon_spec: CephadmDaemonDeploySpec) -> CephadmDaemonDeploySpec:
+        assert self.TYPE == daemon_spec.daemon_type
+        daemon_spec.final_config, daemon_spec.deps = self.generate_config(daemon_spec)
+        return daemon_spec
+
+    def generate_config(self, daemon_spec: CephadmDaemonDeploySpec) -> Tuple[Dict[str, Any], List[str]]:
+        assert self.TYPE == daemon_spec.daemon_type
+        daemons = self.mgr.cache.get_daemons_by_service('loki')
+        loki_host = ''
+        for i, dd in enumerate(daemons):
+            assert dd.hostname is not None
+            if i == 0:
+                loki_host = dd.ip if dd.ip else self.mgr.get_fqdn(dd.hostname)
+
+        context = {
+            'client_hostname': loki_host,
+        }
+
+        alloy_config = self.mgr.template.render('services/alloy.j2', context)
+        return {
+            "files": {
+                "config.alloy": alloy_config
+            }
+        }, self.get_dependencies(self.mgr)
+
+
 @register_cephadm_service
 class PromtailService(CephadmService):
     TYPE = 'promtail'
src/pybind/mgr/cephadm/templates/services/alloy.j2

Lines changed: 20 additions & 0 deletions

@@ -0,0 +1,20 @@
+local.file_match "system" {
+    path_targets = [{
+        __address__ = "localhost",
+        __path__ = "/var/log/ceph/**/*.log",
+        job = "Cluster Logs",
+    }]
+}
+
+loki.source.file "system" {
+    targets = local.file_match.system.targets
+    forward_to = [loki.write.default.receiver]
+    legacy_positions_file = "/var/lib/alloy/data/positions.yaml"
+}
+
+loki.write "default" {
+    endpoint {
+        url = "http://{{ client_hostname }}:3100/loki/api/v1/push"
+    }
+    external_labels = {}
+}
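The only templated value is client_hostname, which AlloyService.generate_config above fills with the first Loki daemon's IP or FQDN; with a hypothetical host loki-host.example.com the push endpoint would render as:

    url = "http://loki-host.example.com:3100/loki/api/v1/push"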

src/pybind/mgr/cephadm/tests/test_services.py

Lines changed: 1 addition & 1 deletion

@@ -143,7 +143,7 @@ def test_get_auth_entity(self):

         # services based on CephadmService shouldn't have get_auth_entity
         with pytest.raises(AttributeError):
-            for daemon_type in ['grafana', 'alertmanager', 'prometheus', 'node-exporter', 'loki', 'promtail']:
+            for daemon_type in ['grafana', 'alertmanager', 'prometheus', 'node-exporter', 'loki', 'promtail', 'alloy']:
                 service_registry.get_service(daemon_type).get_auth_entity("id1", "host")
                 service_registry.get_service(daemon_type).get_auth_entity("id1", "")
                 service_registry.get_service(daemon_type).get_auth_entity("id1")

src/pybind/mgr/cephadm/utils.py

Lines changed: 1 addition & 1 deletion

@@ -26,7 +26,7 @@ class CephadmNoImage(Enum):
               'rbd-mirror', 'cephfs-mirror', 'ceph-exporter']
 GATEWAY_TYPES = ['iscsi', 'nfs', 'nvmeof', 'smb']
 MONITORING_STACK_TYPES = ['node-exporter', 'prometheus',
-                          'alertmanager', 'grafana', 'loki', 'promtail']
+                          'alertmanager', 'grafana', 'loki', 'promtail', 'alloy']
 RESCHEDULE_FROM_OFFLINE_HOSTS_TYPES = ['haproxy', 'nfs']

 CEPH_UPGRADE_ORDER = CEPH_TYPES + GATEWAY_TYPES + MONITORING_STACK_TYPES
