sysdiglabs
diff --git a/‎apps/kubernetes-control-plane.yaml‎
Lines changed: 1 addition & 0 deletions b/‎apps/kubernetes-control-plane.yaml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎resources/consul/INSTALL.md‎
Lines changed: 11 additions & 0 deletions b/‎resources/consul/INSTALL.md‎
Lines changed: 11 additions & 0 deletions
diff --git a/‎resources/consul/alerts.yaml‎
Lines changed: 3 additions & 3 deletions b/‎resources/consul/alerts.yaml‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎resources/consul/include/consul_sysdig.json‎
Lines changed: 4 additions & 4 deletions b/‎resources/consul/include/consul_sysdig.json‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎resources/consul/include/sysdig-agent.yaml‎
Lines changed: 4 additions & 12 deletions b/‎resources/consul/include/sysdig-agent.yaml‎
Lines changed: 4 additions & 12 deletions
diff --git a/‎resources/kubernetes-control/ALERTSv1.22.md‎
Lines changed: 112 additions & 0 deletions b/‎resources/kubernetes-control/ALERTSv1.22.md‎
Lines changed: 112 additions & 0 deletions
diff --git a/‎resources/kubernetes-control/INSTALLv1.22.md‎
Lines changed: 26 additions & 0 deletions b/‎resources/kubernetes-control/INSTALLv1.22.md‎
Lines changed: 26 additions & 0 deletions
diff --git a/‎resources/kubernetes-control/READMEv1.22.md‎
Lines changed: 19 additions & 0 deletions b/‎resources/kubernetes-control/READMEv1.22.md‎
Lines changed: 19 additions & 0 deletions
@@ -8,6 +8,7 @@ keywords:
 availableVersions: 
   - '1.14.0'
   - '1.18.0'
+  - '1.22.0'
 shortDescription: Open-source system for automating deployment, scaling, and management of containerized applications.
 description: |
   # Kubernetes (K8s) control plane monitor. 
 
@@ -1,2 +1,13 @@
 # Prerequisites
 Consul instruments Prometheus metrics and annotates the pods with Prometheus annotations. 
+
+As seen in Consul documentation pages (https://www.consul.io/docs/k8s/helm#v-global-metrics and https://www.consul.io/docs/agent/options#telemetry-prometheus_retention_time), to make Consul expose an endpoint for scraping metrics, you need to enable a few global.metrics configurations.
+You also need to enable the telemetry.disable_hostname "extra configurations" in the Consul Server and Client, so the metrics don't contain the name of the instances.
+
+If you install Consul with Helm, you need to use the following flags:
+```
+--set 'global.metrics.enabled=true'
+--set 'global.metrics.enableAgentMetrics=true'
+--set 'server.extraConfig="{"telemetry": {"disable_hostname": true}}"'
+--set 'client.extraConfig="{"telemetry": {"disable_hostname": true}}"'
+```
@@ -74,15 +74,15 @@ configurations:
           description: There are too many elections for leadership."
       - alert: Server cluster unhealthy
         expr: |
-          consul_per_server_autopilot_healthy == 0
+          consul_autopilot_healthy == 0
         for: 5m
         labels:
           severity: high
         annotations:
           description: One or many Consul servers in the cluster are unhealthy.
       - alert: Zero failure tolerance
         expr: |
-          consul_per_server_autopilot_failure_tolerance == 0
+          consul_autopilot_failure_tolerance == 0
         for: 5m
         labels:
           severity: medium
@@ -138,7 +138,7 @@ configurations:
           description: Garbage Collection stop-the-world pauses were greater than 5 seconds per minute.
       - alert: Raft restore duration too high
         expr: |
-          consul_per_server_raft_leader_oldestLogAge < 2* max(consul_raft_fsm_lastRestoreDuration{kube_pod_label_component="server"})
+          consul_raft_leader_oldestLogAge < 2* max(consul_raft_fsm_lastRestoreDuration{kube_pod_label_component="server"})
         for: 5m
         labels:
           severity: medium
 
@@ -456,7 +456,7 @@
               "unit": "number",
               "yAxis": "auto"
             },
-            "query": "min(consul_per_server_autopilot_healthy{kube_cluster_name=~$cluster, kube_namespace_name=~$namespace})"
+            "query": "min(consul_autopilot_healthy{kube_cluster_name=~$cluster, kube_namespace_name=~$namespace})"
           }
         ],
         "description": "",
@@ -655,7 +655,7 @@
               "unit": "number",
               "yAxis": "auto"
             },
-            "query": "consul_per_server_autopilot_failure_tolerance{kube_cluster_name=~$cluster, kube_namespace_name=~$namespace}"
+            "query": "consul_autopilot_failure_tolerance{kube_cluster_name=~$cluster, kube_namespace_name=~$namespace}"
           }
         ],
         "axesConfiguration": {
@@ -718,7 +718,7 @@
               "unit": "number",
               "yAxis": "auto"
             },
-            "query": "consul_per_server_autopilot_healthy{kube_cluster_name=~$cluster, kube_namespace_name=~$namespace}"
+            "query": "consul_autopilot_healthy{kube_cluster_name=~$cluster, kube_namespace_name=~$namespace}"
           }
         ],
         "axesConfiguration": {
@@ -1433,7 +1433,7 @@
               "unit": "relativeTime",
               "yAxis": "auto"
             },
-            "query": "consul_per_server_raft_leader_oldestLogAge{kube_cluster_name=~$cluster, kube_namespace_name=~$namespace} > 0\n"
+            "query": "consul_raft_leader_oldestLogAge{kube_cluster_name=~$cluster, kube_namespace_name=~$namespace} > 0\n"
           },
           {
             "displayInfo": {
 
@@ -29,6 +29,9 @@ data:
       scrape_interval: 10s
     scrape_configs:
     - job_name: 'consul-envoy-default'
+      metrics_path: '/v1/agent/metrics'
+      params:
+        format: ['prometheus']
       tls_config:
         insecure_skip_verify: true
       kubernetes_sd_configs:
@@ -102,20 +105,9 @@ data:
       - action: keep
         source_labels: [__address__]
         regex: (.*:8500)
-      - action: replace
-        source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
-        target_label: __metrics_path__
-        regex: (.+)
-        replacement: '/v1/agent/metrics'
       - action: replace
         source_labels: [__meta_kubernetes_pod_uid]
         target_label: sysdig_k8s_pod_uid
       - action: replace
         source_labels: [__meta_kubernetes_pod_container_name]
-        target_label: sysdig_k8s_pod_container_name
-      metric_relabel_configs:
-      # Change the name of the metric to remove the name of the pod
-      - source_labels: ['__name__']
-        target_label: '__name__'
-        regex: '(consul_)([a-z]+_)+[0-9]+_(.+)'
-        replacement: ${1}per_server_${3}
+        target_label: sysdig_k8s_pod_container_name
@@ -0,0 +1,112 @@
+# Alerts
+## [KubeProxy] Kube Proxy Down
+KubeProxy detected down
+
+## [KubeProxy] High Rest Client Latency
+High Rest Client Latency detected
+
+## [KubeProxy] High Rule Sync Latency
+High Rule Sync Latency detected
+
+## [KubeProxy] Too Many 500 Code
+Too Many 500 Code detected
+
+## [CoreDNS] Error High
+High Request Duration
+
+## [CoreDNS] Latency High
+Latency High
+
+## [Etcd] Etcd Members Down
+There are members down.
+
+## [Etcd] Etcd Insufficient Members
+Etcd cluster has insufficient members
+
+## [Etcd] Etcd No Leader
+Member has no leader.
+
+## [Etcd] Etcd High Number Of Leader Changes
+Leader changes within the last 15 minutes.
+
+## [Etcd] Etcd High Number Of Failed GRPC Requests
+High number of failed grpc requests
+
+## [Etcd] Etcd GRPC Requests Slow
+gRPC requests are taking too much time
+
+## [Etcd] Etcd High Number Of Failed Proposals
+High number of proposal failures within the last 30 minutes on etcd instance
+
+## [Etcd] Etcd High Fsync Durations
+99th percentile fsync durations are too high
+
+## [Etcd] Etcd High Commit Durations
+99th percentile commit durations are too high
+
+## [Etcd] Etcd HighNumber Of Failed HTTP Requests
+High number of failed http requests
+
+## [Etcd] Etcd HTTP Requests Slow
+HTTP requests are slow
+
+## [Kubelet] PV Not Available
+Persistent Volume not available
+
+## [Kubelet] High Storage Error Rate
+High Storage Error Rate
+
+## [Kubelet] High Storage Latency
+High Storage Latency
+
+## [Kubernetes Api Server] Deprecated APIs
+API-Server Deprecated APIs
+
+## [Kubernetes Api Server] Certificate Expiry
+API-Server Certificate Expiry
+
+## [Kubernetes Api Server] Admission Controller High Latency
+API-Server Admission Controller High Latency
+
+## [Kubernetes Api Server] Webhook Admission Controller High Latency
+API-Server Webhook Admission Controller High Latency
+
+## [Kubernetes Api Server] High 4xx RequestError Rate
+API-Server High 4xx Request Error Rate
+
+## [Kubernetes Api Server] High 5xx RequestError Rate
+API-Server High 5xx Request Error Rate
+
+## [Kubernetes Api Server] High Request Latency
+API-Server High Request Latency
+
+## [k8s-kubelet] Kubelet Too Many Pods
+Kubelet Too Many Pods
+
+## [k8s-kubelet] Kubelet Pod Lifecycle Event Generator Duration High
+Kubelet Pod Lifecycle Event Generator Duration High
+
+## [k8s-kubelet] Kubelet Pod StartUp Latency High
+Kubelet Pod StartUp Latency High
+
+## [k8s-kubelet] Kubelet Down
+Kubelet Down
+
+## [k8s-pvc] PV Not Available
+Persistent Volume not available
+
+## [k8s-pvc] PVC Pending For a Long Time
+Persistent Volume Claim not available
+
+## [k8s-pvc] PVC Lost
+Persistent Volume Claim lost
+
+## [k8s-pvc] PVC Storage Usage Is Reaching The Limit
+Persistent Volume Claim storage at 95%
+
+## [k8s-pvc] PVC Inodes Usage Is Reaching The Limit
+PVC inodes Usage Is Reaching The Limit
+
+## [k8s-pvc] PV Full In Four Days
+Persistent Volume Full In Four Days
+
@@ -0,0 +1,26 @@
+# Mount the etcd certificates in the Sysdig agent
+```sh
+kubectl -n sysdig-agent patch ds sysdig-agent -p '{"spec":{"template":{"spec":{"volumes":[{"hostPath":{"path":"/etc/kubernetes/pki/etcd-manager-main","type":"DirectoryOrCreate"},"name":"etcd-certificates"}]}}}}'
+  
+kubectl -n sysdig-agent patch ds sysdig-agent -p '{"spec":{"template":{"spec":{"containers":[{"name":"sysdig-agent","volumeMounts": [{"mountPath": "/etc/kubernetes/pki/etcd-manager","name": "etcd-certificates"}]}]}}}}'
+```
+
+# Exposing the Proxy port in kops
+If you are using kops, you will have to change the cluster spec to expose the port for the proxy. To edit the cluster, run:
+
+```
+kops --state s3://name-of-s3 --name cluster-name edit cluster
+```
+
+And add the following lines:
+
+```yaml
+kubeProxy:
+  metricsBindAddress: 0.0.0.0
+```
+
+And update the cluster:
+
+```
+kops --state s3://name-of-s3 --name cluster-name rolling-update cluster --yes
+```
@@ -0,0 +1,19 @@
+# Kubernetes
+Kubernetes (K8s) is an open-source system for automating deployment, scaling, and management of containerized applications.
+
+The metrics for the Kubernetes control plane are gathered from the pods located in the `kube-system` namespace.
+
+# Metrics
+With these metrics we can see information about:
+- Api-server
+- Kubelet
+- Controller manager
+- Scheduler
+- Proxy
+- etcd
+- coreDNS
+
+# Attributions
+Configuration files and dashboards maintained by [Sysdig team](https://sysdig.com/).
+
+All dashboards and alerts are adapted from the [kubernetes mixin](https://github.com/kubernetes-monitoring/kubernetes-mixin), which was used as a reference.