|
| 1 | +# Fetch metrics from the built-in Prometheus endpoint of HDFS components. |
| 2 | + |
| 3 | +import logging |
| 4 | +import re |
| 5 | +import sys |
| 6 | + |
| 7 | +import requests |
| 8 | + |
| 9 | + |
def check_metrics(
    namespace: str, role: str, port: int, expected_metrics: list[str]
) -> None:
    """Scrape the /prom endpoint of one HDFS role pod and assert metrics exist.

    Each entry in expected_metrics is treated as a regular expression that
    must match at the beginning of some line of the Prometheus exposition
    output (anchored with ``^`` under MULTILINE).
    """
    url = (
        f"http://hdfs-{role}-default-0.hdfs-{role}-default"
        f".{namespace}.svc.cluster.local:{port}/prom"
    )
    response: requests.Response = requests.get(url, timeout=10)
    assert response.ok, "Requesting metrics failed"

    metrics_text = response.text
    for pattern in expected_metrics:
        found = re.search(f"^{pattern}", metrics_text, re.MULTILINE)
        assert found is not None, f"Metric '{pattern}' not found for {role}"
| 23 | + |
| 24 | + |
def check_namenode_metrics(
    namespace: str,
    product_version: str,
) -> None:
    """Assert that the NameNode /prom endpoint exposes the expected metrics.

    The patterns cover one representative metric from each shape emitted by
    the NameNode: a "MetricsSystem" gauge, a "_total" counter, a "_created"
    counter, and a plain metric. Label sets that vary between runs are
    matched with a ``.*`` wildcard instead of literal attributes.
    """
    namenode_patterns: list[str] = [
        # Kind "MetricsSystem"
        'metrics_system_num_active_sources{context="metricssystem",hostname="hdfs-namenode-default-0"}',
        # Counter suffixed with "_total"
        # The metric attributes can change so use .* for them.
        # The full name looks like: 'fs_namesystem_files_total{context="dfs",enabledecpolicies="RS-6-3-1024k",hastate="active",totalsynctimes="4 7 ",hostname="hdfs-namenode-default-0"}',
        "fs_namesystem_files_total.*",
        # Metric suffixed with "_created"
        'namenode_files_created{processname="NameNode",sessionid="null",context="dfs",hostname="hdfs-namenode-default-0"}',
        # Boolean metric
        # 'hadoop_namenode_security_enabled{kind="NameNodeStatus",role="NameNode",service="HDFS"}',
        # Non-special metric
        'namenode_files_deleted{processname="NameNode",sessionid="null",context="dfs",hostname="hdfs-namenode-default-0"}',
    ]

    check_metrics(namespace, "namenode", 9870, namenode_patterns)
| 45 | + |
| 46 | + |
def check_datanode_metrics(
    namespace: str,
    product_version: str,
) -> None:
    """Assert that the DataNode /prom endpoint exposes the expected metrics.

    One representative pattern is listed per metric shape the DataNode
    produces; label sets containing volatile values (e.g. storage dir paths)
    are matched with a ``.*`` wildcard rather than spelled out literally.
    """
    datanode_patterns: list[str] = [
        # Kind "MetricsSystem"
        'metrics_system_num_active_sources{context="metricssystem",hostname="hdfs-datanode-default-0"}',
        # Kind "FSDatasetState" suffixed with "_total"
        # 'org_apache_hadoop_hdfs_server_datanode_fsdataset_impl_fs_dataset_impl_estimated_capacity_lost_total{context="FSDatasetState",storageinfo="FSDataset{dirpath=\'[/stackable/data/hdd/datanode,/stackable/data/hdd-1/datanode, /stackable/data/ssd/datanode]\'}",hostname="hdfs-datanode-default-0"}',
        "org_apache_hadoop_hdfs_server_datanode_fsdataset_impl_fs_dataset_impl_estimated_capacity_lost_total.*",
        # Kind "FSDatasetState"
        # 'org_apache_hadoop_hdfs_server_datanode_fsdataset_impl_fs_dataset_impl_capacity{context="FSDatasetState",storageinfo="FSDataset{dirpath=\'[/stackable/data/hdd/datanode, /stackable/data/hdd-1/datanode, /stackable/data/ssd/datanode]\'}",hostname="hdfs-datanode-default-0"}',
        "org_apache_hadoop_hdfs_server_datanode_fsdataset_impl_fs_dataset_impl_capacity.*",
        # Kind "DataNodeActivity" suffixed with "_info"
        'datanode_blocks_get_local_path_info{sessionid="null",context="dfs",hostname="hdfs-datanode-default-0"}',
        # Kind "DataNodeActivity"
        'datanode_blocks_read{sessionid="null",context="dfs",hostname="hdfs-datanode-default-0"}',
        # Counter suffixed with "_total"
        # 'org_apache_hadoop_hdfs_server_datanode_fsdataset_impl_fs_dataset_impl_estimated_capacity_lost_total{context="FSDatasetState",storageinfo="FSDataset{dirpath=\'[/stackable/data/hdd/datanode,/stackable/data/hdd-1/datanode, /stackable/data/ssd/datanode]\'}",hostname="hdfs-datanode-default-0"}',
        "org_apache_hadoop_hdfs_server_datanode_fsdataset_impl_fs_dataset_impl_estimated_capacity_lost_total.*",
        # Boolean metric
        #'hadoop_datanode_security_enabled{kind="DataNodeInfo",role="DataNode",service="HDFS"}',
        # Non-special metric
        'jvm_metrics_gc_count{context="jvm",processname="DataNode",sessionid="null",hostname="hdfs-datanode-default-0"}',
    ]

    check_metrics(namespace, "datanode", 9864, datanode_patterns)
| 74 | + |
| 75 | + |
def check_journalnode_metrics(
    namespace: str,
    product_version: str,
) -> None:
    """Assert that the JournalNode /prom endpoint exposes the expected metrics.

    Only two shapes are checked because the JournalNode emits no boolean
    metric.
    """
    journalnode_patterns: list[str] = [
        # Kind "MetricsSystem"
        'metrics_system_num_active_sources{context="metricssystem",hostname="hdfs-journalnode-default-0"}',
        # Non-special metric
        'journal_node_bytes_written{context="dfs",journalid="hdfs",hostname="hdfs-journalnode-default-0"}',
        # There is no boolean metric in JournalNode.
    ]

    check_metrics(namespace, "journalnode", 8480, journalnode_patterns)
| 89 | + |
| 90 | + |
if __name__ == "__main__":
    # Fail with a clear usage message instead of an IndexError when the
    # required CLI arguments are missing.
    if len(sys.argv) < 3:
        raise SystemExit(f"Usage: {sys.argv[0]} <namespace> <product-version>")

    namespace_arg: str = sys.argv[1]
    product_version_arg: str = sys.argv[2]

    logging.basicConfig(
        level="DEBUG",
        format="%(asctime)s %(levelname)s: %(message)s",
        stream=sys.stdout,
    )

    # Each check raises AssertionError on a missing metric, aborting the run.
    check_namenode_metrics(namespace_arg, product_version_arg)
    check_datanode_metrics(namespace_arg, product_version_arg)
    check_journalnode_metrics(namespace_arg, product_version_arg)

    print("All expected metrics found")
0 commit comments