Skip to content

Commit 674bcfb

Browse files
Add alert + kpi for knowledge status conditions
1 parent d58e977 commit 674bcfb

File tree

9 files changed

+376
-3
lines changed

9 files changed

+376
-3
lines changed

helm/bundles/cortex-cinder/alerts/cinder.alerts.yaml

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -159,7 +159,7 @@ groups:
159159
service will have a less recent view of the datacenter.
160160
161161
- alert: CortexCinderDatasourceUnready
162-
expr: cortex_datasource_state{operator="cortex-cinder",state=~"waiting|error|unknown"} != 0
162+
expr: cortex_datasource_state{operator="cortex-cinder",state!="ready"} != 0
163163
for: 60m
164164
labels:
165165
context: datasources
@@ -173,3 +173,19 @@ groups:
173173
This may indicate issues with the datasource
174174
connectivity or configuration. It is recommended to investigate the
175175
datasource status and logs for more details.
176+
177+
- alert: CortexCinderKnowledgeUnready
178+
expr: cortex_knowledge_state{operator="cortex-cinder",state!="ready"} != 0
179+
for: 60m
180+
labels:
181+
context: knowledge
182+
dashboard: cortex/cortex
183+
service: cortex
184+
severity: warning
185+
support_group: workload-management
186+
annotations:
187+
summary: "Knowledge `{{$labels.knowledge}}` is in `{{$labels.state}}` state"
188+
description: >
189+
This may indicate issues with the knowledge
190+
configuration. It is recommended to investigate the
191+
knowledge status and logs for more details.

helm/bundles/cortex-cinder/templates/kpis.yaml

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,3 +10,15 @@ spec:
1010
datasourceOperator: cortex-cinder
1111
description: |
1212
This KPI tracks the state of datasource resources managed by cortex.
13+
---
14+
apiVersion: cortex.cloud/v1alpha1
15+
kind: KPI
16+
metadata:
17+
name: cortex-cinder-knowledge-state-kpi
18+
spec:
19+
operator: cortex-cinder
20+
impl: knowledge_state_kpi
21+
opts:
22+
knowledgeOperator: cortex-cinder
23+
description: |
24+
This KPI tracks the state of knowledge resources managed by cortex.

helm/bundles/cortex-manila/alerts/manila.alerts.yaml

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -159,7 +159,7 @@ groups:
159159
service will have a less recent view of the datacenter.
160160
161161
- alert: CortexManilaDatasourceUnready
162-
expr: cortex_datasource_state{operator="cortex-manila",state=~"waiting|error|unknown"} != 0
162+
expr: cortex_datasource_state{operator="cortex-manila",state!="ready"} != 0
163163
for: 60m
164164
labels:
165165
context: datasources
@@ -173,3 +173,19 @@ groups:
173173
This may indicate issues with the datasource
174174
connectivity or configuration. It is recommended to investigate the
175175
datasource status and logs for more details.
176+
177+
- alert: CortexManilaKnowledgeUnready
178+
expr: cortex_knowledge_state{operator="cortex-manila",state!="ready"} != 0
179+
for: 60m
180+
labels:
181+
context: knowledge
182+
dashboard: cortex/cortex
183+
service: cortex
184+
severity: warning
185+
support_group: workload-management
186+
annotations:
187+
summary: "Knowledge `{{$labels.knowledge}}` is in `{{$labels.state}}` state"
188+
description: >
189+
This may indicate issues with the knowledge
190+
configuration. It is recommended to investigate the
191+
knowledge status and logs for more details.

helm/bundles/cortex-manila/templates/kpis.yaml

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,3 +23,15 @@ spec:
2323
datasourceOperator: cortex-manila
2424
description: |
2525
This KPI tracks the state of datasource resources managed by cortex.
26+
---
27+
apiVersion: cortex.cloud/v1alpha1
28+
kind: KPI
29+
metadata:
30+
name: cortex-manila-knowledge-state-kpi
31+
spec:
32+
operator: cortex-manila
33+
impl: knowledge_state_kpi
34+
opts:
35+
knowledgeOperator: cortex-manila
36+
description: |
37+
This KPI tracks the state of knowledge resources managed by cortex.

helm/bundles/cortex-nova/alerts/nova.alerts.yaml

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -175,7 +175,7 @@ groups:
175175
service will have a less recent view of the datacenter.
176176
177177
- alert: CortexNovaDatasourceUnready
178-
expr: cortex_datasource_state{operator="cortex-nova",state=~"waiting|error|unknown"} != 0
178+
expr: cortex_datasource_state{operator="cortex-nova",state!="ready"} != 0
179179
for: 60m
180180
labels:
181181
context: datasources
@@ -189,3 +189,19 @@ groups:
189189
This may indicate issues with the datasource
190190
connectivity or configuration. It is recommended to investigate the
191191
datasource status and logs for more details.
192+
193+
- alert: CortexNovaKnowledgeUnready
194+
expr: cortex_knowledge_state{operator="cortex-nova",state!="ready"} != 0
195+
for: 60m
196+
labels:
197+
context: knowledge
198+
dashboard: cortex/cortex
199+
service: cortex
200+
severity: warning
201+
support_group: workload-management
202+
annotations:
203+
summary: "Knowledge `{{$labels.knowledge}}` is in `{{$labels.state}}` state"
204+
description: >
205+
This may indicate issues with the knowledge
206+
configuration. It is recommended to investigate the
207+
knowledge status and logs for more details.

helm/bundles/cortex-nova/templates/kpis.yaml

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,3 +147,15 @@ spec:
147147
datasourceOperator: cortex-nova
148148
description: |
149149
This KPI tracks the state of datasource resources managed by cortex.
150+
---
151+
apiVersion: cortex.cloud/v1alpha1
152+
kind: KPI
153+
metadata:
154+
name: cortex-nova-knowledge-state-kpi
155+
spec:
156+
operator: cortex-nova
157+
impl: knowledge_state_kpi
158+
opts:
159+
knowledgeOperator: cortex-nova
160+
description: |
161+
This KPI tracks the state of knowledge resources managed by cortex.
Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
// Copyright 2025 SAP SE
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
package deployment
5+
6+
import (
	"context"
	"log/slog"

	"github.com/cobaltcore-dev/cortex/api/v1alpha1"
	"github.com/cobaltcore-dev/cortex/internal/knowledge/kpis/plugins"
	"github.com/cobaltcore-dev/cortex/pkg/conf"
	"github.com/cobaltcore-dev/cortex/pkg/db"
	"github.com/prometheus/client_golang/prometheus"
	"k8s.io/apimachinery/pkg/api/meta"
	"sigs.k8s.io/controller-runtime/pkg/client"
)
17+
18+
// KnowledgeStateKPIOpts holds the options of the knowledge state KPI.
type KnowledgeStateKPIOpts struct {
	// KnowledgeOperator selects which knowledge resources are reported:
	// only those whose spec operator equals this value are considered.
	KnowledgeOperator string `yaml:"knowledgeOperator"`
}
22+
23+
// KPI observing the state of knowledge resources managed by cortex.
24+
type KnowledgeStateKPI struct {
25+
// Common base for all KPIs that provides standard functionality.
26+
plugins.BaseKPI[KnowledgeStateKPIOpts]
27+
28+
// Prometheus descriptor for the knowledge state metric.
29+
counter *prometheus.Desc
30+
}
31+
32+
// GetName returns the identifier of this KPI implementation, which is the
// value referenced by the `impl` field of the KPI manifests.
func (KnowledgeStateKPI) GetName() string {
	return "knowledge_state_kpi"
}
33+
34+
// Initialize the KPI.
35+
func (k *KnowledgeStateKPI) Init(db *db.DB, client client.Client, opts conf.RawOpts) error {
36+
if err := k.BaseKPI.Init(db, client, opts); err != nil {
37+
return err
38+
}
39+
k.counter = prometheus.NewDesc(
40+
"cortex_knowledge_state",
41+
"State of cortex managed knowledges",
42+
[]string{"operator", "knowledge", "state"},
43+
nil,
44+
)
45+
return nil
46+
}
47+
48+
// Conform to the prometheus collector interface by providing the descriptor.
49+
func (k *KnowledgeStateKPI) Describe(ch chan<- *prometheus.Desc) { ch <- k.counter }
50+
51+
// Collect the knowledge state metrics.
52+
func (k *KnowledgeStateKPI) Collect(ch chan<- prometheus.Metric) {
53+
// Get all knowledges with the specified knowledge operator.
54+
knowledgeList := &v1alpha1.KnowledgeList{}
55+
if err := k.Client.List(context.Background(), knowledgeList); err != nil {
56+
return
57+
}
58+
var knowledges []v1alpha1.Knowledge
59+
for _, ds := range knowledgeList.Items {
60+
if ds.Spec.Operator != k.Options.KnowledgeOperator {
61+
continue
62+
}
63+
knowledges = append(knowledges, ds)
64+
}
65+
// For each knowledge, emit a metric with its state.
66+
for _, ds := range knowledges {
67+
var state string
68+
switch {
69+
case meta.IsStatusConditionTrue(ds.Status.Conditions, v1alpha1.KnowledgeConditionError):
70+
state = "error"
71+
case ds.Status.IsReady():
72+
state = "ready"
73+
default:
74+
state = "unknown"
75+
}
76+
ch <- prometheus.MustNewConstMetric(
77+
k.counter, prometheus.GaugeValue, 1,
78+
k.Options.KnowledgeOperator, ds.Name, state,
79+
)
80+
}
81+
}

0 commit comments

Comments
 (0)