Skip to content

Commit 463f966

Browse files
labrenbes and sbernauer authored
Add support for Hadoop 3.4.0 (#743)
* wip: add hadoop 3.4.0 * Use correct boost package * build hdfs utils from source * fix hdfs-utils build * use hfs utils 0.3.0 * fix Dockerfile * fix jmx exporter version * Update hadoop/Dockerfile Co-authored-by: Sebastian Bernauer <[email protected]> --------- Co-authored-by: Sebastian Bernauer <[email protected]> Co-authored-by: Sebastian Bernauer <[email protected]>
1 parent bbdd7ee commit 463f966

File tree

6 files changed

+508
-20
lines changed

6 files changed

+508
-20
lines changed

hadoop/Dockerfile

Lines changed: 27 additions & 16 deletions
Original file line number · Diff line number · Diff line change
@@ -11,14 +11,14 @@ ARG PRODUCT
1111
ARG ASYNC_PROFILER
1212
ARG JMX_EXPORTER
1313
ARG PROTOBUF
14-
ARG TOPOLOGY_PROVIDER
14+
ARG HDFS_UTILS
1515
ARG TARGETARCH
1616
ARG TARGETOS
1717

1818
WORKDIR /stackable
1919

20-
COPY hadoop/stackable /stackable
21-
20+
COPY hadoop/stackable/jmx /stackable/jmx
21+
COPY hadoop/stackable/fuse_dfs_wrapper /stackable/fuse_dfs_wrapper
2222

2323
# The symlink from JMX Exporter 0.16.1 to the versionless link exists because old HDFS Operators (up until and including 23.7) used to hardcode
2424
# the version of JMX Exporter like this: "-javaagent:/stackable/jmx/jmx_prometheus_javaagent-0.16.1.jar"
@@ -49,8 +49,31 @@ RUN curl --fail -L -s -S https://repo.stackable.tech/repository/packages/protobu
4949
ENV PROTOBUF_HOME /opt/protobuf
5050
ENV PATH "${PATH}:/opt/protobuf/bin"
5151

52+
RUN rpm -ivh https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm
53+
RUN microdnf update && \
54+
microdnf install \
55+
# boost is a build dependency starting in Hadoop 3.4.0 if compiling native code
56+
boost1.78-devel && \
57+
microdnf clean all && \
58+
rm -rf /var/cache/yum
59+
5260
WORKDIR /stackable
5361

62+
# The Stackable HDFS utils contain an OPA authorizer, group mapper & topology provider.
63+
# The topology provider provides rack awareness functionality for HDFS by allowing users to specify Kubernetes
64+
# labels to build a rackID from.
65+
# Starting with hdfs-utils version 0.3.0 the topology provider is not a standalone jar anymore and included in hdfs-utils.
66+
67+
RUN curl --fail -L "https://github.com/stackabletech/hdfs-utils/archive/refs/tags/v${HDFS_UTILS}.tar.gz" | tar -xzC . && \
68+
cd hdfs-utils-${HDFS_UTILS} && \
69+
mvn clean package -P hadoop-${PRODUCT} -DskipTests -Dmaven.javadoc.skip=true && \
70+
mkdir -p /stackable/hadoop-${PRODUCT}/share/hadoop/common/lib && \
71+
cp target/hdfs-utils-$HDFS_UTILS.jar /stackable/hadoop-${PRODUCT}/share/hadoop/common/lib/hdfs-utils-${HDFS_UTILS}.jar && \
72+
rm -rf /stackable/hdfs-utils-main && \
73+
cd -
74+
75+
COPY hadoop/stackable/patches /stackable/patches
76+
5477
# Hadoop Pipes requires libtirpc to build, whose headers are not packaged in RedHat UBI, so skip building this module
5578
# Build from source to enable FUSE module, and to apply custom patches.
5679
# Also skip building the yarn, mapreduce and minicluster modules: this will result in the modules being excluded but not all
@@ -82,13 +105,10 @@ COPY shared/log4shell_scanner /bin/log4shell_scanner
82105
RUN /bin/log4shell_scanner s "/stackable/hadoop-${PRODUCT}"
83106
# ===
84107

85-
# Final Image
86-
FROM stackable/image/java-base
108+
FROM stackable/image/java-base as final
87109

88110
ARG PRODUCT
89111
ARG RELEASE
90-
ARG TOPOLOGY_PROVIDER
91-
ARG HDFS_UTILS
92112

93113
LABEL name="Apache Hadoop" \
94114
maintainer="[email protected]" \
@@ -125,15 +145,6 @@ COPY --chown=stackable:stackable --from=builder /stackable/jmx /stackable/jmx/
125145
COPY --chown=stackable:stackable --from=builder /stackable/async-profiler /stackable/async-profiler/
126146
RUN ln -s /stackable/hadoop-${PRODUCT} /stackable/hadoop
127147

128-
# The topology provider provides rack awareness functionality for HDFS by allowing users to specify Kubernetes
129-
# labels to build a rackID from
130-
# source code is at: https://github.com/stackabletech/hdfs-topology-provider
131-
# N.B. the artifact name changed from 0.2.0 onwards i.e. from topology-provider-0.1.0.jar to hdfs-topology-provider-0.2.0.jar
132-
RUN curl --fail -L -s -S https://repo.stackable.tech/repository/packages/hdfs-topology-provider/hdfs-topology-provider-${TOPOLOGY_PROVIDER}.jar -o /stackable/hadoop/share/hadoop/common/lib/hdfs-topology-provider-${TOPOLOGY_PROVIDER}.jar
133-
134-
# The Stackable HDFS utils contain an OPA authorizer and group mapper
135-
RUN curl --fail -L -s -S https://repo.stackable.tech/repository/packages/hdfs-utils/hdfs-utils-${HDFS_UTILS}.jar -o /stackable/hadoop/share/hadoop/common/lib/hdfs-utils-${HDFS_UTILS}.jar
136-
137148
COPY hadoop/stackable/fuse_dfs_wrapper /stackable/hadoop/bin
138149

139150
ENV HOME=/stackable
Lines changed: 13 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -0,0 +1,13 @@
1+
diff --git a/hadoop-project/pom.xml b/hadoop-project/pom.xml
2+
index 0ed96d087bc..9ebb6af4567 100644
3+
--- a/hadoop-project/pom.xml
4+
+++ b/hadoop-project/pom.xml
5+
@@ -217,7 +217,7 @@
6+
<jsonschema2pojo.version>1.0.2</jsonschema2pojo.version>
7+
<woodstox.version>5.4.0</woodstox.version>
8+
<nimbus-jose-jwt.version>9.31</nimbus-jose-jwt.version>
9+
- <nodejs.version>v12.22.1</nodejs.version>
10+
+ <nodejs.version>v14.0.0</nodejs.version>
11+
<yarnpkg.version>v1.22.5</yarnpkg.version>
12+
<apache-ant.version>1.10.13</apache-ant.version>
13+
<jmh.version>1.20</jmh.version>
Lines changed: 247 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -0,0 +1,247 @@
1+
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java
2+
index e3f4bfcde84..3d65bcad229 100755
3+
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java
4+
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java
5+
@@ -147,6 +147,13 @@ public class DFSConfigKeys extends CommonConfigurationKeys {
6+
public static final boolean DFS_DATANODE_DROP_CACHE_BEHIND_READS_DEFAULT = false;
7+
public static final String DFS_DATANODE_USE_DN_HOSTNAME = "dfs.datanode.use.datanode.hostname";
8+
public static final boolean DFS_DATANODE_USE_DN_HOSTNAME_DEFAULT = false;
9+
+
10+
+ public static final String DFS_DATANODE_REGISTERED_HOSTNAME = "dfs.datanode.registered.hostname";
11+
+ public static final String DFS_DATANODE_REGISTERED_DATA_PORT = "dfs.datanode.registered.port";
12+
+ public static final String DFS_DATANODE_REGISTERED_HTTP_PORT = "dfs.datanode.registered.http.port";
13+
+ public static final String DFS_DATANODE_REGISTERED_HTTPS_PORT = "dfs.datanode.registered.https.port";
14+
+ public static final String DFS_DATANODE_REGISTERED_IPC_PORT = "dfs.datanode.registered.ipc.port";
15+
+
16+
public static final String DFS_DATANODE_MAX_LOCKED_MEMORY_KEY = "dfs.datanode.max.locked.memory";
17+
public static final long DFS_DATANODE_MAX_LOCKED_MEMORY_DEFAULT = 0;
18+
public static final String DFS_DATANODE_FSDATASETCACHE_MAX_THREADS_PER_VOLUME_KEY = "dfs.datanode.fsdatasetcache.max.threads.per.volume";
19+
@@ -454,6 +461,8 @@ public class DFSConfigKeys extends CommonConfigurationKeys {
20+
public static final long DFS_DATANODE_PROCESS_COMMANDS_THRESHOLD_DEFAULT =
21+
TimeUnit.SECONDS.toMillis(2);
22+
23+
+ public static final String DFS_NAMENODE_DATANODE_REGISTRATION_UNSAFE_ALLOW_ADDRESS_OVERRIDE_KEY = "dfs.namenode.datanode.registration.unsafe.allow-address-override";
24+
+ public static final boolean DFS_NAMENODE_DATANODE_REGISTRATION_UNSAFE_ALLOW_ADDRESS_OVERRIDE_DEFAULT = false;
25+
public static final String DFS_NAMENODE_DATANODE_REGISTRATION_IP_HOSTNAME_CHECK_KEY = "dfs.namenode.datanode.registration.ip-hostname-check";
26+
public static final boolean DFS_NAMENODE_DATANODE_REGISTRATION_IP_HOSTNAME_CHECK_DEFAULT = true;
27+
28+
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeManager.java
29+
index 07381fc696f..8aeb92cff11 100644
30+
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeManager.java
31+
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeManager.java
32+
@@ -180,6 +180,8 @@ public class DatanodeManager {
33+
private boolean hasClusterEverBeenMultiRack = false;
34+
35+
private final boolean checkIpHostnameInRegistration;
36+
+ private final boolean allowRegistrationAddressOverride;
37+
+
38+
/**
39+
* Whether we should tell datanodes what to cache in replies to
40+
* heartbeat messages.
41+
@@ -316,6 +318,11 @@ public class DatanodeManager {
42+
// Block invalidate limit also has some dependency on heartbeat interval.
43+
// Check setBlockInvalidateLimit().
44+
setBlockInvalidateLimit(configuredBlockInvalidateLimit);
45+
+ this.allowRegistrationAddressOverride = conf.getBoolean(
46+
+ DFSConfigKeys.DFS_NAMENODE_DATANODE_REGISTRATION_UNSAFE_ALLOW_ADDRESS_OVERRIDE_KEY,
47+
+ DFSConfigKeys.DFS_NAMENODE_DATANODE_REGISTRATION_UNSAFE_ALLOW_ADDRESS_OVERRIDE_DEFAULT);
48+
+ LOG.info(DFSConfigKeys.DFS_NAMENODE_DATANODE_REGISTRATION_UNSAFE_ALLOW_ADDRESS_OVERRIDE_KEY
49+
+ + "=" + allowRegistrationAddressOverride);
50+
this.checkIpHostnameInRegistration = conf.getBoolean(
51+
DFSConfigKeys.DFS_NAMENODE_DATANODE_REGISTRATION_IP_HOSTNAME_CHECK_KEY,
52+
DFSConfigKeys.DFS_NAMENODE_DATANODE_REGISTRATION_IP_HOSTNAME_CHECK_DEFAULT);
53+
@@ -1138,27 +1145,29 @@ void startAdminOperationIfNecessary(DatanodeDescriptor nodeReg) {
54+
*/
55+
public void registerDatanode(DatanodeRegistration nodeReg)
56+
throws DisallowedDatanodeException, UnresolvedTopologyException {
57+
- InetAddress dnAddress = Server.getRemoteIp();
58+
- if (dnAddress != null) {
59+
- // Mostly called inside an RPC, update ip and peer hostname
60+
- String hostname = dnAddress.getHostName();
61+
- String ip = dnAddress.getHostAddress();
62+
- if (checkIpHostnameInRegistration && !isNameResolved(dnAddress)) {
63+
- // Reject registration of unresolved datanode to prevent performance
64+
- // impact of repetitive DNS lookups later.
65+
- final String message = "hostname cannot be resolved (ip="
66+
- + ip + ", hostname=" + hostname + ")";
67+
- LOG.warn("Unresolved datanode registration: " + message);
68+
- throw new DisallowedDatanodeException(nodeReg, message);
69+
+ if (!allowRegistrationAddressOverride) {
70+
+ InetAddress dnAddress = Server.getRemoteIp();
71+
+ if (dnAddress != null) {
72+
+ // Mostly called inside an RPC, update ip and peer hostname
73+
+ String hostname = dnAddress.getHostName();
74+
+ String ip = dnAddress.getHostAddress();
75+
+ if (checkIpHostnameInRegistration && !isNameResolved(dnAddress)) {
76+
+ // Reject registration of unresolved datanode to prevent performance
77+
+ // impact of repetitive DNS lookups later.
78+
+ final String message = "hostname cannot be resolved (ip="
79+
+ + ip + ", hostname=" + hostname + ")";
80+
+ LOG.warn("Unresolved datanode registration: " + message);
81+
+ throw new DisallowedDatanodeException(nodeReg, message);
82+
+ }
83+
+ // update node registration with the ip and hostname from rpc request
84+
+ nodeReg.setIpAddr(ip);
85+
+ nodeReg.setPeerHostName(hostname);
86+
}
87+
- // update node registration with the ip and hostname from rpc request
88+
- nodeReg.setIpAddr(ip);
89+
- nodeReg.setPeerHostName(hostname);
90+
}
91+
-
92+
+
93+
try {
94+
nodeReg.setExportedKeys(blockManager.getBlockKeys());
95+
-
96+
+
97+
// Checks if the node is not on the hosts list. If it is not, then
98+
// it will be disallowed from registering.
99+
if (!hostConfigManager.isIncluded(nodeReg)) {
100+
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DNConf.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DNConf.java
101+
index 9b5343321d3..790d508e5ea 100644
102+
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DNConf.java
103+
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DNConf.java
104+
@@ -100,6 +100,11 @@ public class DNConf {
105+
final boolean syncOnClose;
106+
final boolean encryptDataTransfer;
107+
final boolean connectToDnViaHostname;
108+
+ private final String registeredHostname;
109+
+ private final int registeredDataPort;
110+
+ private final int registeredHttpPort;
111+
+ private final int registeredHttpsPort;
112+
+ private final int registeredIpcPort;
113+
final boolean overwriteDownstreamDerivedQOP;
114+
private final boolean pmemCacheRecoveryEnabled;
115+
116+
@@ -188,6 +193,11 @@ public DNConf(final Configurable dn) {
117+
connectToDnViaHostname = getConf().getBoolean(
118+
DFSConfigKeys.DFS_DATANODE_USE_DN_HOSTNAME,
119+
DFSConfigKeys.DFS_DATANODE_USE_DN_HOSTNAME_DEFAULT);
120+
+ registeredHostname = getConf().get(DFSConfigKeys.DFS_DATANODE_REGISTERED_HOSTNAME);
121+
+ registeredDataPort = getConf().getInt(DFSConfigKeys.DFS_DATANODE_REGISTERED_DATA_PORT, -1);
122+
+ registeredHttpPort = getConf().getInt(DFSConfigKeys.DFS_DATANODE_REGISTERED_HTTP_PORT, -1);
123+
+ registeredHttpsPort = getConf().getInt(DFSConfigKeys.DFS_DATANODE_REGISTERED_HTTPS_PORT, -1);
124+
+ registeredIpcPort = getConf().getInt(DFSConfigKeys.DFS_DATANODE_REGISTERED_IPC_PORT, -1);
125+
this.blockReportInterval = getConf().getLong(
126+
DFS_BLOCKREPORT_INTERVAL_MSEC_KEY,
127+
DFS_BLOCKREPORT_INTERVAL_MSEC_DEFAULT);
128+
@@ -362,6 +372,66 @@ public boolean getConnectToDnViaHostname() {
129+
return connectToDnViaHostname;
130+
}
131+
132+
+ /**
133+
+ * Returns a hostname to register with the cluster instead of the system
134+
+ * hostname.
135+
+ * This is an expert setting and can be used in multihoming scenarios to
136+
+ * override the detected hostname.
137+
+ *
138+
+ * @return null if the system hostname should be used, otherwise a hostname
139+
+ */
140+
+ public String getRegisteredHostname() {
141+
+ return registeredHostname;
142+
+ }
143+
+
144+
+ /**
145+
+ * Returns a port number to register with the cluster instead of the
146+
+ * data port that the node is listening on.
147+
+ * This is an expert setting and can be used in multihoming scenarios to
148+
+ * override the detected port.
149+
+ *
150+
+ * @return -1 if the actual port should be used, otherwise a port number
151+
+ */
152+
+ public int getRegisteredDataPort() {
153+
+ return registeredDataPort;
154+
+ }
155+
+
156+
+ /**
157+
+ * Returns a port number to register with the cluster instead of the
158+
+ * HTTP port that the node is listening on.
159+
+ * This is an expert setting and can be used in multihoming scenarios to
160+
+ * override the detected port.
161+
+ *
162+
+ * @return -1 if the actual port should be used, otherwise a port number
163+
+ */
164+
+ public int getRegisteredHttpPort() {
165+
+ return registeredHttpPort;
166+
+ }
167+
+
168+
+ /**
169+
+ * Returns a port number to register with the cluster instead of the
170+
+ * HTTPS port that the node is listening on.
171+
+ * This is an expert setting and can be used in multihoming scenarios to
172+
+ * override the detected port.
173+
+ *
174+
+ * @return -1 if the actual port should be used, otherwise a port number
175+
+ */
176+
+ public int getRegisteredHttpsPort() {
177+
+ return registeredHttpsPort;
178+
+ }
179+
+
180+
+ /**
181+
+ * Returns a port number to register with the cluster instead of the
182+
+ * IPC port that the node is listening on.
183+
+ * This is an expert setting and can be used in multihoming scenarios to
184+
+ * override the detected port.
185+
+ *
186+
+ * @return -1 if the actual port should be used, otherwise a port number
187+
+ */
188+
+ public int getRegisteredIpcPort() {
189+
+ return registeredIpcPort;
190+
+ }
191+
+
192+
/**
193+
* Returns socket timeout
194+
*
195+
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNode.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNode.java
196+
index 96c4ad9ae28..fdb8e631dc8 100644
197+
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNode.java
198+
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNode.java
199+
@@ -117,6 +117,7 @@
200+
import java.util.Iterator;
201+
import java.util.List;
202+
import java.util.Map;
203+
+import java.util.Optional;
204+
import java.util.Map.Entry;
205+
import java.util.Set;
206+
import java.util.UUID;
207+
@@ -1876,11 +1877,35 @@ DatanodeRegistration createBPRegistration(NamespaceInfo nsInfo) {
208+
NodeType.DATA_NODE);
209+
}
210+
211+
- DatanodeID dnId = new DatanodeID(
212+
- streamingAddr.getAddress().getHostAddress(), hostName,
213+
- storage.getDatanodeUuid(), getXferPort(), getInfoPort(),
214+
- infoSecurePort, getIpcPort());
215+
- return new DatanodeRegistration(dnId, storageInfo,
216+
+ String registeredHostname = Optional
217+
+ .ofNullable(dnConf.getRegisteredHostname())
218+
+ .orElseGet(() -> streamingAddr.getAddress().getHostAddress());
219+
+ int registeredDataPort = dnConf.getRegisteredDataPort();
220+
+ if (registeredDataPort == -1) {
221+
+ registeredDataPort = getXferPort();
222+
+ }
223+
+ int registeredHttpPort = dnConf.getRegisteredHttpPort();
224+
+ if (registeredHttpPort == -1) {
225+
+ registeredHttpPort = getInfoPort();
226+
+ }
227+
+ int registeredHttpsPort = dnConf.getRegisteredHttpsPort();
228+
+ if (registeredHttpsPort == -1) {
229+
+ registeredHttpsPort = getInfoSecurePort();
230+
+ }
231+
+ int registeredIpcPort = dnConf.getRegisteredIpcPort();
232+
+ if (registeredIpcPort == -1) {
233+
+ registeredIpcPort = getIpcPort();
234+
+ }
235+
+
236+
+ DatanodeID dnId = new DatanodeID(registeredHostname,
237+
+ registeredHostname,
238+
+ storage.getDatanodeUuid(),
239+
+ registeredDataPort,
240+
+ registeredHttpPort,
241+
+ registeredHttpsPort,
242+
+ registeredIpcPort);
243+
+
244+
+ return new DatanodeRegistration(dnId, storageInfo,
245+
new ExportedBlockKeys(), VersionInfo.getVersion());
246+
}
247+
Lines changed: 20 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -0,0 +1,20 @@
1+
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/http/ProfileServlet.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/http/ProfileServlet.java
2+
index fc0ec7736ed8..e324ad6d49fd 100644
3+
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/http/ProfileServlet.java
4+
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/http/ProfileServlet.java
5+
@@ -76,6 +76,7 @@
6+
* Following event types are supported (default is 'cpu') (NOTE: not all OS'es support all events)
7+
* // Perf events:
8+
* // cpu
9+
+ * // itimer
10+
* // page-faults
11+
* // context-switches
12+
* // cycles
13+
@@ -115,6 +116,7 @@ public class ProfileServlet extends HttpServlet {
14+
private enum Event {
15+
16+
CPU("cpu"),
17+
+ ITIMER("itimer"),
18+
ALLOC("alloc"),
19+
LOCK("lock"),
20+
PAGE_FAULTS("page-faults"),

0 commit comments

Comments (0)