Skip to content

Commit deae3b9

Browse files
committed
add test and update docs
1 parent 11d6276 commit deae3b9

File tree

4 files changed

+116
-0
lines changed

4 files changed

+116
-0
lines changed

docs/modules/hdfs/pages/usage-guide/monitoring.adoc

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,12 @@ The cluster can be monitored with Prometheus from inside or outside the K8S clus
66
All services (with the exception of the Zookeeper daemon on the node names) run with the JMX exporter agent enabled and expose metrics on the `metrics` port.
77
This port is available from the container level up to the NodePort services.
88

9+
[IMPORTANT]
10+
====
11+
Starting with Stackable Data Platform 25.7, the built-in Prometheus metrics are also available at the `/prom` endpoint of all the UI services.
12+
The JMX exporter metrics are now deprecated and will be removed in a future release.
13+
====
14+
915
The metrics endpoints are also used as liveness probes by Kubernetes.
1016

1117
See xref:operators:monitoring.adoc[] for more details.

tests/templates/kuttl/smoke/51-assert.yaml.j2

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,5 +8,9 @@ commands:
88
{% else %}
99
PRODUCT_VERSION={{ test_scenario['values']['hadoop'] }}
1010
{% endif %}
11+
# Test JMX exported metrics
1112
kubectl exec --namespace=$NAMESPACE test-runner-0 -- \
1213
python /tmp/test_metrics.py $NAMESPACE $PRODUCT_VERSION
14+
# Test Prometheus metrics
15+
kubectl exec --namespace=$NAMESPACE test-runner-0 -- \
16+
python /tmp/test_prometheus_metrics.py $NAMESPACE $PRODUCT_VERSION

tests/templates/kuttl/smoke/51-copy-metrics-test-script.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,3 +3,4 @@ apiVersion: kuttl.dev/v1beta1
33
kind: TestStep
44
commands:
55
- script: kubectl cp -n $NAMESPACE ./test_metrics.py test-runner-0:/tmp
6+
- script: kubectl cp -n $NAMESPACE ./test_prometheus_metrics.py test-runner-0:/tmp
Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
# Fetch metrics from the built-in Prometheus endpoint of HDFS components.
2+
3+
import logging
4+
import re
5+
import sys
6+
7+
import requests
8+
9+
10+
def check_metrics(
    namespace: str, role: str, port: int, expected_metrics: list[str]
) -> None:
    """Scrape the built-in Prometheus endpoint of one HDFS role and verify metrics.

    Fetches ``/prom`` from pod 0 of the given role's default role group and
    asserts that every entry of *expected_metrics* — each a regex anchored at
    the start of a line — occurs in the response body.
    """
    url = (
        f"http://hdfs-{role}-default-0.hdfs-{role}-default"
        f".{namespace}.svc.cluster.local:{port}/prom"
    )
    response: requests.Response = requests.get(url, timeout=10)
    assert response.ok, "Requesting metrics failed"

    # Entries may end in ".*" to tolerate changing metric attributes.
    for metric in expected_metrics:
        match = re.search(f"^{metric}", response.text, re.MULTILINE)
        assert match is not None, (
            f"Metric '{metric}' not found for {role}"
        )
23+
24+
25+
def check_namenode_metrics(
    namespace: str,
    product_version: str,
) -> None:
    """Assert that the NameNode ``/prom`` endpoint (port 9870) serves the expected metrics.

    Each list entry is a regex matched at line start against the scraped body.
    ``product_version`` is not used here; presumably kept for a uniform
    signature across the check functions — TODO confirm.
    """
    expected_metrics: list[str] = [
        # Kind "MetricsSystem"
        'metrics_system_num_active_sources{context="metricssystem",hostname="hdfs-namenode-default-0"}',
        # Counter suffixed with "_total"
        # The metric attributes can change so use .* for them.
        # The full name looks like: 'fs_namesystem_files_total{context="dfs",enabledecpolicies="RS-6-3-1024k",hastate="active",totalsynctimes="4 7 ",hostname="hdfs-namenode-default-0"}',
        "fs_namesystem_files_total.*",
        # Metric suffixed with "_created"
        'namenode_files_created{processname="NameNode",sessionid="null",context="dfs",hostname="hdfs-namenode-default-0"}',
        # Boolean metric
        # 'hadoop_namenode_security_enabled{kind="NameNodeStatus",role="NameNode",service="HDFS"}',
        # Non-special metric
        'namenode_files_deleted{processname="NameNode",sessionid="null",context="dfs",hostname="hdfs-namenode-default-0"}',
    ]

    check_metrics(namespace, "namenode", 9870, expected_metrics)
45+
46+
47+
def check_datanode_metrics(
    namespace: str,
    product_version: str,
) -> None:
    """Assert that the DataNode ``/prom`` endpoint (port 9864) serves the expected metrics.

    Each list entry is a regex matched at line start against the scraped body.
    ``product_version`` is not used here; presumably kept for a uniform
    signature across the check functions — TODO confirm.
    """
    expected_metrics: list[str] = [
        # Kind "MetricsSystem"
        'metrics_system_num_active_sources{context="metricssystem",hostname="hdfs-datanode-default-0"}',
        # Kind "FSDatasetState" suffixed with "_total"
        # 'org_apache_hadoop_hdfs_server_datanode_fsdataset_impl_fs_dataset_impl_estimated_capacity_lost_total{context="FSDatasetState",storageinfo="FSDataset{dirpath=\'[/stackable/data/hdd/datanode,/stackable/data/hdd-1/datanode, /stackable/data/ssd/datanode]\'}",hostname="hdfs-datanode-default-0"}',
        "org_apache_hadoop_hdfs_server_datanode_fsdataset_impl_fs_dataset_impl_estimated_capacity_lost_total.*",
        # Kind "FSDatasetState"
        # 'org_apache_hadoop_hdfs_server_datanode_fsdataset_impl_fs_dataset_impl_capacity{context="FSDatasetState",storageinfo="FSDataset{dirpath=\'[/stackable/data/hdd/datanode, /stackable/data/hdd-1/datanode, /stackable/data/ssd/datanode]\'}",hostname="hdfs-datanode-default-0"}',
        "org_apache_hadoop_hdfs_server_datanode_fsdataset_impl_fs_dataset_impl_capacity.*",
        # Kind "DataNodeActivity" suffixed with "_info"
        'datanode_blocks_get_local_path_info{sessionid="null",context="dfs",hostname="hdfs-datanode-default-0"}',
        # Kind "DataNodeActivity"
        'datanode_blocks_read{sessionid="null",context="dfs",hostname="hdfs-datanode-default-0"}',
        # Counter suffixed with "_total"
        # 'org_apache_hadoop_hdfs_server_datanode_fsdataset_impl_fs_dataset_impl_estimated_capacity_lost_total{context="FSDatasetState",storageinfo="FSDataset{dirpath=\'[/stackable/data/hdd/datanode,/stackable/data/hdd-1/datanode, /stackable/data/ssd/datanode]\'}",hostname="hdfs-datanode-default-0"}',
        "org_apache_hadoop_hdfs_server_datanode_fsdataset_impl_fs_dataset_impl_estimated_capacity_lost_total.*",
        # Boolean metric
        #'hadoop_datanode_security_enabled{kind="DataNodeInfo",role="DataNode",service="HDFS"}',
        # Non-special metric
        'jvm_metrics_gc_count{context="jvm",processname="DataNode",sessionid="null",hostname="hdfs-datanode-default-0"}',
    ]

    check_metrics(namespace, "datanode", 9864, expected_metrics)
74+
75+
76+
def check_journalnode_metrics(
    namespace: str,
    product_version: str,
) -> None:
    """Assert that the JournalNode ``/prom`` endpoint (port 8480) serves the expected metrics.

    Each list entry is a regex matched at line start against the scraped body.
    ``product_version`` is not used here; presumably kept for a uniform
    signature across the check functions — TODO confirm.
    """
    expected_metrics: list[str] = [
        # Kind "MetricsSystem"
        'metrics_system_num_active_sources{context="metricssystem",hostname="hdfs-journalnode-default-0"}',
        # Non-special metric
        'journal_node_bytes_written{context="dfs",journalid="hdfs",hostname="hdfs-journalnode-default-0"}',
        # There is no boolean metric in JournalNode.
    ]

    check_metrics(namespace, "journalnode", 8480, expected_metrics)
89+
90+
91+
if __name__ == "__main__":
    # Positional args: <namespace> <product-version>
    _, namespace_arg, product_version_arg = sys.argv[:3]

    logging.basicConfig(
        level="DEBUG",
        format="%(asctime)s %(levelname)s: %(message)s",
        stream=sys.stdout,
    )

    # Run the per-role checks in order; any missing metric raises AssertionError.
    for check in (
        check_namenode_metrics,
        check_datanode_metrics,
        check_journalnode_metrics,
    ):
        check(namespace_arg, product_version_arg)

    print("All expected metrics found")

0 commit comments

Comments
 (0)