
Commit 3740dff

[filesystem] Support using Hadoop dependencies from the HADOOP_CLASSPATH environment variable
1 parent b2f5a6e commit 3740dff

4 files changed, +60 -0 lines changed


fluss-dist/src/main/resources/bin/config.sh

Lines changed: 11 additions & 0 deletions
@@ -28,6 +28,12 @@ constructFlussClassPath() {
         else
             FLUSS_CLASSPATH="$FLUSS_CLASSPATH":"$jarfile"
         fi
+
+        # Add Hadoop dependencies from environment variables HADOOP_CLASSPATH
+        if [ -n "${HADOOP_CLASSPATH}" ]; then
+            FLUSS_CLASSPATH="$FLUSS_CLASSPATH":"$HADOOP_CLASSPATH"
+        fi
+
     done < <(find "$FLUSS_LIB_DIR" ! -type d -name '*.jar' -print0 | sort -z)

     local FLUSS_SERVER_COUNT

@@ -131,6 +137,7 @@ KEY_ENV_SSH_OPTS="env.ssh.opts"
 KEY_ZK_HEAP_MB="zookeeper.heap.mb"

 KEY_REMOTE_DATA_DIR="remote.data.dir"
+KEY_ENV_HADOOP_CLASSPATH="env.hadoop.class-path"

 ########################################################################################################################
 # PATHS AND CONFIG

@@ -285,6 +292,10 @@ if [ -z "${REMOTE_DATA_DIR}" ]; then
     REMOTE_DATA_DIR=$(readFromConfig ${KEY_REMOTE_DATA_DIR} "" "${YAML_CONF}")
 fi

+if [ -z "${HADOOP_CLASSPATH}" ]; then
+    HADOOP_CLASSPATH=$(readFromConfig ${KEY_ENV_HADOOP_CLASSPATH} "" "${YAML_CONF}")
+fi
+
 # Arguments for the JVM. Used for Coordinator server and Tablet server JVMs.
 if [ -z "${JVM_ARGS}" ]; then
     JVM_ARGS=""

fluss-server/pom.xml

Lines changed: 6 additions & 0 deletions
@@ -131,6 +131,12 @@
                 <include>*:*</include>
             </includes>
         </artifactSet>
+        <relocations>
+            <relocation>
+                <pattern>org.apache.commons</pattern>
+                <shadedPattern>org.apache.fluss.shaded.org.apache.commons</shadedPattern>
+            </relocation>
+        </relocations>
     </configuration>
 </execution>
 </executions>
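
The relocation matters once `hadoop classpath` is on the Fluss classpath, since the cluster's Hadoop distribution ships its own `org.apache.commons` jars; shading the server's copy under `org.apache.fluss.shaded` keeps the two versions from clashing. A quick, hypothetical post-build sanity check (the jar path and name are illustrative):

```bash
# Count commons classes that ended up under the relocated package in the
# shaded server jar (path and jar name are illustrative).
unzip -l fluss-server/target/fluss-server-*.jar \
  | grep -c 'org/apache/fluss/shaded/org/apache/commons/'
```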

website/docs/maintenance/filesystems/hdfs.md

Lines changed: 14 additions & 0 deletions
@@ -36,6 +36,20 @@ remote.data.dir: hdfs://namenode:50010/path/to/remote/storage
 To allow for easy adoption, you can use the same configuration keys in Fluss' server.yaml as in Hadoop's `core-site.xml`.
 You can see the configuration keys in Hadoop's [`core-site.xml`](https://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-common/core-default.xml).

+#### Hadoop Environment Configuration
+
+To use the machine's Hadoop environment instead of Fluss' embedded Hadoop, follow these steps:
+
+**Step 1: Set Hadoop Classpath**
+```bash
+export HADOOP_CLASSPATH=`hadoop classpath`
+```
+
+**Step 2: Add the following to your configuration file**
+```yaml
+plugin.classloader.parent-first-patterns.default: java.,com.alibaba.fluss.,javax.annotation.,org.slf4j,org.apache.log4j,org.apache.logging,org.apache.commons.logging,ch.qos.logback,hdfs-site,core-site,org.apache.hadoop.,META-INF
+```
+
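
For reference, `hadoop classpath` prints the colon-separated list of the local installation's conf directory and jars, which is exactly what Step 1 exports; adding `org.apache.hadoop.` to the parent-first patterns then lets those Hadoop classes resolve through the parent classloader rather than the bundled plugin. A quick way to inspect the exported value (entries shown are illustrative):

```bash
# Inspect what Step 1 puts on the classpath; output below is illustrative.
export HADOOP_CLASSPATH=$(hadoop classpath)
echo "${HADOOP_CLASSPATH}" | tr ':' '\n' | head
#   /etc/hadoop/conf
#   /usr/lib/hadoop/lib/*
#   /usr/lib/hadoop/hadoop-common-3.3.6.jar
#   ...
```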

website/docs/maintenance/tiered-storage/lakehouse-storage.md

Lines changed: 29 additions & 0 deletions
@@ -47,6 +47,35 @@ datalake.paimon.metastore: filesystem
 datalake.paimon.warehouse: /tmp/paimon_data_warehouse
 ```

+#### Hadoop Environment Configuration
+
+To use the machine's Hadoop environment instead of Fluss' embedded Hadoop, follow these steps:
+
+**Step 1: Set Hadoop Classpath**
+```bash
+export HADOOP_CLASSPATH=`hadoop classpath`
+```
+
+**Step 2: Add the following to your configuration file**
+```yaml
+plugin.classloader.parent-first-patterns.default: java.,com.alibaba.fluss.,javax.annotation.,org.slf4j,org.apache.log4j,org.apache.logging,org.apache.commons.logging,ch.qos.logback,hdfs-site,core-site,org.apache.hadoop.,META-INF
+```
+
+#### Hive Catalog Configuration
+
+To use Hive as the metastore, follow these steps:
+
+**Step 1: Add Hive Connector Dependency**
+[Download](https://nightlies.apache.org/flink/flink-docs-stable/docs/connectors/table/hive/overview/#using-bundled-hive-jar) the Flink SQL Hive connector JAR. Place the downloaded JAR in Paimon's plugin directory:
+`$PAIMON_HOME/plugins/hive`.
+
+**Step 2: Add the following to your configuration file**
+```yaml
+datalake.paimon.metastore: hive
+# this is recommended in a Kerberos environment
+datalake.paimon.hive-conf-dir: '...'
+```
+
 ### Start The Datalake Tiering Service
 Then, you must start the datalake tiering service to compact Fluss's data to the lakehouse storage.
 To start the datalake tiering service, you must have a Flink cluster running since Fluss currently only supports Flink as a tiering service backend.
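
Step 1 of the Hive catalog setup, spelled out as shell commands; the jar name and download location are illustrative, not prescribed by the docs:

```bash
# Hypothetical walk-through of "Add Hive Connector Dependency": put the bundled
# Flink SQL Hive jar where Paimon's plugin loader can find it.
mkdir -p "$PAIMON_HOME/plugins/hive"
cp ~/Downloads/flink-sql-connector-hive-*.jar "$PAIMON_HOME/plugins/hive/"
ls -l "$PAIMON_HOME/plugins/hive"
```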
