
Commit 715b349

Add Hadoop Monitoring (#210)
* Added hadoop monitoring capabilities
* Updated hadoop namespace in documentation
* Applied project formatting
* Updated units to match specification
* Changed container creation in hadoop test
* Fixed formatting in hadoop integration test
* Updated metric names and descriptions
* Updated tests and documentation for new hadoop names
1 parent a3167a5 commit 715b349

9 files changed (+358, -1 lines)

jmx-metrics/README.md

Lines changed: 1 addition & 0 deletions
@@ -71,6 +71,7 @@ mutually exclusive with `otel.jmx.groovy.script`. The currently supported target
 | [`jvm`](./docs/target-systems/jvm.md) |
 | [`activemq`](./docs/target-systems/activemq.md) |
 | [`cassandra`](./docs/target-systems/cassandra.md) |
+| [`hadoop`](./docs/target-systems/hadoop.md) |
 | [`kafka`](./docs/target-systems/kafka.md) |
 | [`kafka-consumer`](./docs/target-systems/kafka-consumer.md) |
 | [`kafka-producer`](./docs/target-systems/kafka-producer.md) |
Lines changed: 59 additions & 0 deletions
@@ -0,0 +1,59 @@

# Hadoop Metrics

The JMX Metric Gatherer provides built-in Hadoop metric gathering capabilities.
These metrics are sourced from: https://hadoop.apache.org/docs/r2.7.2/hadoop-project-dist/hadoop-common/Metrics.html

### Name Node Metrics

* Name: `hadoop.name_node.capacity.usage`
* Description: The current used capacity across all data nodes reporting to the name node.
* Unit: `by`
* Labels: `node_name`
* Instrument Type: LongUpDownCounterCallback

* Name: `hadoop.name_node.capacity.limit`
* Description: The total capacity allotted to data nodes reporting to the name node.
* Unit: `by`
* Labels: `node_name`
* Instrument Type: LongUpDownCounterCallback

* Name: `hadoop.name_node.block.count`
* Description: The total number of blocks on the name node.
* Unit: `{blocks}`
* Labels: `node_name`
* Instrument Type: LongUpDownCounterCallback

* Name: `hadoop.name_node.block.missing`
* Description: The number of blocks reported as missing to the name node.
* Unit: `{blocks}`
* Labels: `node_name`
* Instrument Type: LongUpDownCounterCallback

* Name: `hadoop.name_node.block.corrupt`
* Description: The number of blocks reported as corrupt to the name node.
* Unit: `{blocks}`
* Labels: `node_name`
* Instrument Type: LongUpDownCounterCallback

* Name: `hadoop.name_node.volume.failed`
* Description: The number of failed volumes reported to the name node.
* Unit: `{volumes}`
* Labels: `node_name`
* Instrument Type: LongUpDownCounterCallback

* Name: `hadoop.name_node.file.count`
* Description: The total number of files being tracked by the name node.
* Unit: `{files}`
* Labels: `node_name`
* Instrument Type: LongUpDownCounterCallback

* Name: `hadoop.name_node.file.load`
* Description: The current number of concurrent file accesses.
* Unit: `{operations}`
* Labels: `node_name`
* Instrument Type: LongUpDownCounterCallback

* Name: `hadoop.name_node.data_node.count`
* Description: The number of data nodes reporting to the name node.
* Unit: `{nodes}`
* Labels: `node_name`, `state`
* Instrument Type: LongUpDownCounterCallback
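
Each of the instruments above is defined against the name node's `FSNamesystem` MBean by the hadoop target-system Groovy script included later in this commit. For example, `hadoop.name_node.capacity.usage` is wired up as follows (excerpt reproduced here for reference, with annotations added):

def beanHadoopNameNodeFS = otel.mbean("Hadoop:service=NameNode,name=FSNamesystem")
otel.instrument(beanHadoopNameNodeFS,
    "hadoop.name_node.capacity.usage",                               // metric name, as documented above
    "The current used capacity across all data nodes reporting to the name node.",
    "by",                                                            // unit
    ["node_name": { mbean -> mbean.getProperty("tag.Hostname") }],   // node_name label
    "CapacityUsed",                                                  // backing MBean attribute
    otel.&longUpDownCounterCallback)                                 // LongUpDownCounterCallback instrument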
Lines changed: 111 additions & 0 deletions
@@ -0,0 +1,111 @@

/*
 * Copyright The OpenTelemetry Authors
 * SPDX-License-Identifier: Apache-2.0
 */

package io.opentelemetry.contrib.jmxmetrics.target_systems;

import static org.assertj.core.api.Assertions.entry;

import io.opentelemetry.contrib.jmxmetrics.AbstractIntegrationTest;
import java.time.Duration;
import org.junit.jupiter.api.Test;
import org.testcontainers.containers.GenericContainer;
import org.testcontainers.containers.Network;
import org.testcontainers.containers.wait.strategy.Wait;
import org.testcontainers.junit.jupiter.Container;
import org.testcontainers.utility.MountableFile;

class HadoopIntegrationTest extends AbstractIntegrationTest {

  HadoopIntegrationTest() {
    super(/* configFromStdin= */ false, "target-systems/hadoop.properties");
  }

  @Container
  GenericContainer<?> hadoop =
      new GenericContainer<>("bmedora/hadoop:2.9-base")
          .withCopyFileToContainer(
              MountableFile.forClasspathResource("hadoop/hadoop-env.sh", 0400),
              "/hadoop/etc/hadoop/hadoop-env.sh")
          .withCopyFileToContainer(
              MountableFile.forClasspathResource("hadoop/yarn-site.xml", 0400),
              "/hadoop/etc/hadoop/yarn-site.xml")
          .withNetwork(Network.SHARED)
          .withNetworkAliases("hadoop")
          .withExposedPorts(8004)
          .withStartupTimeout(Duration.ofMinutes(2))
          .withCreateContainerCmdModifier(cmd -> cmd.withHostName("test-host"))
          .waitingFor(Wait.forListeningPort());

  @Test
  void endToEnd() {
    waitAndAssertMetrics(
        metric ->
            assertSumWithAttributes(
                metric,
                "hadoop.name_node.capacity.usage",
                "The current used capacity across all data nodes reporting to the name node.",
                "by",
                attrs -> attrs.contains(entry("node_name", "test-host"))),
        metric ->
            assertSumWithAttributes(
                metric,
                "hadoop.name_node.capacity.limit",
                "The total capacity allotted to data nodes reporting to the name node.",
                "by",
                attrs -> attrs.containsOnly(entry("node_name", "test-host"))),
        metric ->
            assertSumWithAttributes(
                metric,
                "hadoop.name_node.block.count",
                "The total number of blocks on the name node.",
                "{blocks}",
                attrs -> attrs.containsOnly(entry("node_name", "test-host"))),
        metric ->
            assertSumWithAttributes(
                metric,
                "hadoop.name_node.block.missing",
                "The number of blocks reported as missing to the name node.",
                "{blocks}",
                attrs -> attrs.containsOnly(entry("node_name", "test-host"))),
        metric ->
            assertSumWithAttributes(
                metric,
                "hadoop.name_node.block.corrupt",
                "The number of blocks reported as corrupt to the name node.",
                "{blocks}",
                attrs -> attrs.containsOnly(entry("node_name", "test-host"))),
        metric ->
            assertSumWithAttributes(
                metric,
                "hadoop.name_node.volume.failed",
                "The number of failed volumes reported to the name node.",
                "{volumes}",
                attrs -> attrs.containsOnly(entry("node_name", "test-host"))),
        metric ->
            assertSumWithAttributes(
                metric,
                "hadoop.name_node.file.count",
                "The total number of files being tracked by the name node.",
                "{files}",
                attrs -> attrs.containsOnly(entry("node_name", "test-host"))),
        metric ->
            assertSumWithAttributes(
                metric,
                "hadoop.name_node.file.load",
                "The current number of concurrent file accesses.",
                "{operations}",
                attrs -> attrs.containsOnly(entry("node_name", "test-host"))),
        metric ->
            assertSumWithAttributes(
                metric,
                "hadoop.name_node.data_node.count",
                "The number of data nodes reporting to the name node.",
                "{nodes}",
                attrs ->
                    attrs.containsOnly(entry("node_name", "test-host"), entry("state", "live")),
                attrs ->
                    attrs.containsOnly(entry("node_name", "test-host"), entry("state", "dead"))));
  }
}
Lines changed: 113 additions & 0 deletions
@@ -0,0 +1,113 @@

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Set Hadoop-specific environment variables here.

# The only required environment variable is JAVA_HOME.  All others are
# optional.  When running a distributed configuration it is best to
# set JAVA_HOME in this file, so that it is correctly defined on
# remote nodes.

# The java implementation to use.
export JAVA_HOME=${JAVA_HOME}

# The jsvc implementation to use. Jsvc is required to run secure datanodes
# that bind to privileged ports to provide authentication of data transfer
# protocol.  Jsvc is not required if SASL is configured for authentication of
# data transfer protocol using non-privileged ports.
#export JSVC_HOME=${JSVC_HOME}

export HADOOP_CONF_DIR=${HADOOP_CONF_DIR:-"/etc/hadoop"}

# Extra Java CLASSPATH elements.  Automatically insert capacity-scheduler.
for f in $HADOOP_HOME/contrib/capacity-scheduler/*.jar; do
  if [ "$HADOOP_CLASSPATH" ]; then
    export HADOOP_CLASSPATH=$HADOOP_CLASSPATH:$f
  else
    export HADOOP_CLASSPATH=$f
  fi
done

# The maximum amount of heap to use, in MB. Default is 1000.
#export HADOOP_HEAPSIZE=
#export HADOOP_NAMENODE_INIT_HEAPSIZE=""

# Enable extra debugging of Hadoop's JAAS binding, used to set up
# Kerberos security.
# export HADOOP_JAAS_DEBUG=true

# Extra Java runtime options.  Empty by default.
# For Kerberos debugging, an extended option set logs more information
# export HADOOP_OPTS="-Djava.net.preferIPv4Stack=true -Dsun.security.krb5.debug=true -Dsun.security.spnego.debug"
export HADOOP_OPTS="$HADOOP_OPTS -Djava.net.preferIPv4Stack=true"

# Command specific options appended to HADOOP_OPTS when specified
export HADOOP_NAMENODE_OPTS="-Dhadoop.security.logger=${HADOOP_SECURITY_LOGGER:-INFO,RFAS} -Dhdfs.audit.logger=${HDFS_AUDIT_LOGGER:-INFO,NullAppender} $HADOOP_NAMENODE_OPTS"
export HADOOP_NAMENODE_OPTS="-Dcom.sun.management.jmxremote $HADOOP_NAMENODE_OPTS"
export HADOOP_NAMENODE_OPTS="$HADOOP_NAMENODE_OPTS -Dcom.sun.management.jmxremote.authenticate=false"
export HADOOP_NAMENODE_OPTS="$HADOOP_NAMENODE_OPTS -Dcom.sun.management.jmxremote.ssl=false"
export HADOOP_NAMENODE_OPTS="$HADOOP_NAMENODE_OPTS -Dcom.sun.management.jmxremote.port=8004 -Dcom.sun.management.jmxremote.rmi.port=8004"

export HADOOP_DATANODE_OPTS="-Dhadoop.security.logger=ERROR,RFAS $HADOOP_DATANODE_OPTS"

export HADOOP_SECONDARYNAMENODE_OPTS="-Dhadoop.security.logger=${HADOOP_SECURITY_LOGGER:-INFO,RFAS} -Dhdfs.audit.logger=${HDFS_AUDIT_LOGGER:-INFO,NullAppender} $HADOOP_SECONDARYNAMENODE_OPTS"

export HADOOP_NFS3_OPTS="$HADOOP_NFS3_OPTS"
export HADOOP_PORTMAP_OPTS="-Xmx512m $HADOOP_PORTMAP_OPTS"

# The following applies to multiple commands (fs, dfs, fsck, distcp etc)
export HADOOP_CLIENT_OPTS="$HADOOP_CLIENT_OPTS"
# set heap args when HADOOP_HEAPSIZE is empty
if [ "$HADOOP_HEAPSIZE" = "" ]; then
  export HADOOP_CLIENT_OPTS="-Xmx512m $HADOOP_CLIENT_OPTS"
fi
#HADOOP_JAVA_PLATFORM_OPTS="-XX:-UsePerfData $HADOOP_JAVA_PLATFORM_OPTS"

# On secure datanodes, user to run the datanode as after dropping privileges.
# This **MUST** be uncommented to enable secure HDFS if using privileged ports
# to provide authentication of data transfer protocol.  This **MUST NOT** be
# defined if SASL is configured for authentication of data transfer protocol
# using non-privileged ports.
export HADOOP_SECURE_DN_USER=${HADOOP_SECURE_DN_USER}

# Where log files are stored.  $HADOOP_HOME/logs by default.
#export HADOOP_LOG_DIR=${HADOOP_LOG_DIR}/$USER

# Where log files are stored in the secure data environment.
#export HADOOP_SECURE_DN_LOG_DIR=${HADOOP_LOG_DIR}/${HADOOP_HDFS_USER}

###
# HDFS Mover specific parameters
###
# Specify the JVM options to be used when starting the HDFS Mover.
# These options will be appended to the options specified as HADOOP_OPTS
# and therefore may override any similar flags set in HADOOP_OPTS
#
# export HADOOP_MOVER_OPTS=""

###
# Advanced Users Only!
###

# The directory where pid files are stored. /tmp by default.
# NOTE: this should be set to a directory that can only be written to by
#       the user that will run the hadoop daemons.  Otherwise there is the
#       potential for a symlink attack.
export HADOOP_PID_DIR=${HADOOP_PID_DIR}
export HADOOP_SECURE_DN_PID_DIR=${HADOOP_PID_DIR}

# A string representing this instance of hadoop. $USER by default.
export HADOOP_IDENT_STRING=$USER
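
The added `HADOOP_NAMENODE_OPTS` lines are what expose the name node's MBeans to the gatherer: they enable remote JMX on port 8004 without authentication or SSL, the same port exposed by the integration test container and referenced by `otel.jmx.service.url`. As a quick sanity check, a minimal Groovy sketch (assuming the name node's JMX port is reachable on `localhost:8004`; the host is an assumption, not part of this commit) that reads two of the backing attributes directly:

import javax.management.ObjectName
import javax.management.remote.JMXConnectorFactory
import javax.management.remote.JMXServiceURL

// Connect to the JMX endpoint opened by the HADOOP_NAMENODE_OPTS lines above.
def connector = JMXConnectorFactory.connect(
    new JMXServiceURL('service:jmx:rmi:///jndi/rmi://localhost:8004/jmxrmi'))
try {
    def server = connector.getMBeanServerConnection()
    // FSNamesystem is the MBean the hadoop target-system script reads from.
    def fsNamesystem = new ObjectName('Hadoop:service=NameNode,name=FSNamesystem')
    println "CapacityUsed: ${server.getAttribute(fsNamesystem, 'CapacityUsed')}"
    println "NumLiveDataNodes: ${server.getAttribute(fsNamesystem, 'NumLiveDataNodes')}"
} finally {
    connector.close()
}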
Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@

<?xml version="1.0"?>
<configuration>
  <property>
    <name>yarn.nodemanager.aux-services</name>
    <value>mapreduce_shuffle</value>
  </property>
  <property>
    <name>yarn.nodemanager.disk-health-checker.max-disk-utilization-per-disk-percentage</name>
    <value>100</value>
  </property>
  <property>
    <name>yarn.nodemanager.resource.cpu-vcores</name>
    <value>2</value>
  </property>
  <property>
    <name>yarn.nodemanager.resource.memory-mb</name>
    <value>4096</value>
  </property>
</configuration>
Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@

otel.jmx.interval.milliseconds = 3000
otel.metrics.exporter = otlp
otel.jmx.service.url = service:jmx:rmi:///jndi/rmi://hadoop:8004/jmxrmi
otel.jmx.target.system = hadoop

# these will be overridden by cmd line
otel.exporter.otlp.endpoint = http://host.testcontainers.internal

jmx-metrics/src/main/groovy/io/opentelemetry/contrib/jmxmetrics/JmxConfig.java

Lines changed: 1 addition & 0 deletions
@@ -36,6 +36,7 @@ class JmxConfig {
       Arrays.asList(
           "activemq",
           "cassandra",
+          "hadoop",
           "jvm",
           "kafka",
           "kafka-consumer",
Lines changed: 45 additions & 0 deletions
@@ -0,0 +1,45 @@

/*
 * Copyright The OpenTelemetry Authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

def beanHadoopNameNodeFS = otel.mbean("Hadoop:service=NameNode,name=FSNamesystem")
otel.instrument(beanHadoopNameNodeFS, "hadoop.name_node.capacity.usage", "The current used capacity across all data nodes reporting to the name node.", "by",
  ["node_name" : { mbean -> mbean.getProperty("tag.Hostname") }],
  "CapacityUsed", otel.&longUpDownCounterCallback)
otel.instrument(beanHadoopNameNodeFS, "hadoop.name_node.capacity.limit", "The total capacity allotted to data nodes reporting to the name node.", "by",
  ["node_name" : { mbean -> mbean.getProperty("tag.Hostname") }],
  "CapacityTotal", otel.&longUpDownCounterCallback)
otel.instrument(beanHadoopNameNodeFS, "hadoop.name_node.block.count", "The total number of blocks on the name node.", "{blocks}",
  ["node_name" : { mbean -> mbean.getProperty("tag.Hostname") }],
  "BlocksTotal", otel.&longUpDownCounterCallback)
otel.instrument(beanHadoopNameNodeFS, "hadoop.name_node.block.missing", "The number of blocks reported as missing to the name node.", "{blocks}",
  ["node_name" : { mbean -> mbean.getProperty("tag.Hostname") }],
  "MissingBlocks", otel.&longUpDownCounterCallback)
otel.instrument(beanHadoopNameNodeFS, "hadoop.name_node.block.corrupt", "The number of blocks reported as corrupt to the name node.", "{blocks}",
  ["node_name" : { mbean -> mbean.getProperty("tag.Hostname") }],
  "CorruptBlocks", otel.&longUpDownCounterCallback)
otel.instrument(beanHadoopNameNodeFS, "hadoop.name_node.volume.failed", "The number of failed volumes reported to the name node.", "{volumes}",
  ["node_name" : { mbean -> mbean.getProperty("tag.Hostname") }],
  "VolumeFailuresTotal", otel.&longUpDownCounterCallback)
otel.instrument(beanHadoopNameNodeFS, "hadoop.name_node.file.count", "The total number of files being tracked by the name node.", "{files}",
  ["node_name" : { mbean -> mbean.getProperty("tag.Hostname") }],
  "TotalFiles", otel.&longUpDownCounterCallback)
otel.instrument(beanHadoopNameNodeFS, "hadoop.name_node.file.load", "The current number of concurrent file accesses.", "{operations}",
  ["node_name" : { mbean -> mbean.getProperty("tag.Hostname") }],
  "TotalLoad", otel.&longUpDownCounterCallback)
otel.instrument(beanHadoopNameNodeFS, "hadoop.name_node.data_node.count", "The number of data nodes reporting to the name node.", "{nodes}",
  ["node_name" : { mbean -> mbean.getProperty("tag.Hostname") }],
  ["NumLiveDataNodes":["state":{"live"}], "NumDeadDataNodes": ["state":{"dead"}]],
  otel.&longUpDownCounterCallback)
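
The final `otel.instrument` call uses the form of the helper that maps several MBean attributes onto a single metric, giving each attribute its own label values; that is how one `hadoop.name_node.data_node.count` instrument reports separate `live` and `dead` series. The same call, re-stated with annotations (a reading of the code above, not additional behavior):

otel.instrument(
    beanHadoopNameNodeFS,                                            // MBean helper from otel.mbean(...)
    "hadoop.name_node.data_node.count",                              // metric name
    "The number of data nodes reporting to the name node.",
    "{nodes}",                                                       // unit
    ["node_name": { mbean -> mbean.getProperty("tag.Hostname") }],   // labels applied to every point
    ["NumLiveDataNodes": ["state": { "live" }],                      // attribute -> extra label values
     "NumDeadDataNodes": ["state": { "dead" }]],
    otel.&longUpDownCounterCallback)                                 // asynchronous up/down counter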

jmx-metrics/src/test/java/io/opentelemetry/contrib/jmxmetrics/JmxConfigTest.java

Lines changed: 2 additions & 1 deletion
@@ -20,6 +20,7 @@ void staticValues() {
         .containsOnly(
             "activemq",
             "cassandra",
+            "hadoop",
             "jvm",
             "kafka",
             "kafka-consumer",
@@ -122,7 +123,7 @@ void invalidTargetSystem() {
     assertThatThrownBy(config::validate)
         .isInstanceOf(ConfigurationException.class)
         .hasMessage(
-            "[jvm, unavailabletargetsystem] must specify targets from [activemq, cassandra, jvm, "
+            "[jvm, unavailabletargetsystem] must specify targets from [activemq, cassandra, hadoop, jvm, "
             + "kafka, kafka-consumer, kafka-producer, solr, tomcat]");
   }
