Skip to content

Inserting records yields java.lang.ClassNotFoundException: Class org.apache.hadoop.fs.s3a.S3AFileSystem not found #18

@santurini

Description

@santurini

I am using this image as a standalone hive metastore for trino.
It works perfectly fine when reading tables stored in MinIO, creating external tables from Parquet files in MinIO and creating schemas.
But as soon as I try an insert operation or more in general a write operation that should create parquet files in MinIO I get this error:

trino> create schema test.db with (location='s3a://test/db/');
CREATE SCHEMA

trino> create table test.db.table (currency varchar, date varchar, rate varchar) with (external_location='s3a://test/db/table/', format='PARQUET');
CREATE TABLE

trino> INSERT INTO test.db.table VALUES ('USD', '2024-02-15', '1.25');
Query 20250217_141259_00010_retkt failed: java.lang.ClassNotFoundException: Class org.apache.hadoop.fs.s3a.S3AFileSystem not found

This is the Dockerfile I am using at the moment:

# NOTE(review): openjdk:16 is a non-LTS, end-of-life base image — confirm a
# supported JDK (11 or 17) works with this Hadoop/Hive combination.
FROM openjdk:16-slim

# Versions of the artifacts COPYed in below; must match the local directories.
ARG HADOOP_VERSION=3.3.0
ARG HIVE_METASTORE_VERSION=3.0.0
ARG POSTGRES_CONNECTOR_VERSION=42.2.18

# Set necessary environment variables.
ENV HADOOP_HOME="/opt/hadoop"
ENV PATH="/opt/spark/bin:/opt/hadoop/bin:${PATH}"
ENV DATABASE_DRIVER=org.postgresql.Driver
ENV DATABASE_TYPE=postgres
ENV DATABASE_TYPE_JDBC=postgresql
ENV DATABASE_PORT=5432
# tools/lib is included so the S3A jars (hadoop-aws, aws-java-sdk) are on the
# classpath; NOTE(review): verify start-metastore actually honors
# HADOOP_CLASSPATH — if it builds its own classpath this has no effect.
ENV HADOOP_CLASSPATH=${HADOOP_HOME}/share/hadoop/common/lib/*:${HADOOP_HOME}/share/hadoop/common/*:${HADOOP_HOME}/share/hadoop/tools/lib/*

WORKDIR /app
SHELL ["/bin/bash", "-o", "pipefail", "-c"]

# hadolint ignore=DL3008,SC2086
# Install OS dependencies
# NOTE(review): curl is installed here but never used by any later build step
# (all artifacts arrive via COPY); net-tools provides netstat for HEALTHCHECK.
RUN apt-get update -y && \
    apt-get install -y curl net-tools --no-install-recommends

# Artifacts are expected to be pre-downloaded into the build context.
COPY hadoop-${HADOOP_VERSION} /opt/hadoop-${HADOOP_VERSION}
COPY apache-hive-metastore-${HIVE_METASTORE_VERSION}-bin /opt/apache-hive-metastore-${HIVE_METASTORE_VERSION}-bin
COPY log4j-web-2.17.1.jar /opt/log4j-web-2.17.1.jar
COPY postgresql-${POSTGRES_CONNECTOR_VERSION}.jar /opt/postgresql-${POSTGRES_CONNECTOR_VERSION}.jar

# Fix 'java.lang.NoSuchMethodError: com.google.common.base.Preconditions.checkArgument'
# Keep this until this lands: https://issues.apache.org/jira/browse/HIVE-22915
RUN rm /opt/apache-hive-metastore-$HIVE_METASTORE_VERSION-bin/lib/guava-19.0.jar && \
    cp /opt/hadoop-$HADOOP_VERSION/share/hadoop/hdfs/lib/guava-27.0-jre.jar /opt/apache-hive-metastore-$HIVE_METASTORE_VERSION-bin/lib/

# Create symlinks to simplify configuration
RUN ln -s /opt/hadoop-$HADOOP_VERSION /opt/hadoop && \
    ln -s /opt/apache-hive-metastore-$HIVE_METASTORE_VERSION-bin /opt/hive-metastore && \
    ln -s /opt/postgresql-$POSTGRES_CONNECTOR_VERSION.jar /opt/hadoop/share/hadoop/common/lib/ && \
    ln -s /opt/postgresql-$POSTGRES_CONNECTOR_VERSION.jar /opt/hive-metastore/lib/ && \
    ln -s /opt/log4j-web-2.17.1.jar /opt/hadoop/share/hadoop/common/lib/ && \
    ln -s /opt/log4j-web-2.17.1.jar /opt/hive-metastore/lib/

# Add S3a jars to hadoop classpath using this hack
# NOTE(review): the globs only resolve if the jars exist under tools/lib in
# the COPYed Hadoop distribution — ln -s happily creates dangling links to a
# literal '*' path otherwise; worth an explicit existence check at build time.
RUN ln -s /opt/hadoop/share/hadoop/tools/lib/hadoop-aws* /opt/hadoop/share/hadoop/common/lib/ && \
    ln -s /opt/hadoop/share/hadoop/tools/lib/aws-java-sdk* /opt/hadoop/share/hadoop/common/lib/ && \
    ln -s /opt/hadoop/share/hadoop/tools/lib/hadoop-aws* /opt/hive-metastore/lib/ && \
    ln -s /opt/hadoop/share/hadoop/tools/lib/aws-java-sdk* /opt/hive-metastore/lib/ && \
    ln -s /opt/hadoop/share/hadoop/common/hadoop-common-${HADOOP_VERSION}.jar /opt/hive-metastore/lib/

# Purge build artifacts
# NOTE(review): purging in a separate RUN does not shrink the image — the
# earlier install layer is immutable. Fold the cleanup into the same RUN as
# the apt-get install if image size matters.
RUN apt-get purge -y --auto-remove curl && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

COPY run.sh run.sh
RUN chmod +x run.sh

# run.sh renders the Hadoop/metastore config files and starts the metastore
# in the foreground; healthcheck probes the Thrift port (9083).
ENTRYPOINT [ "./run.sh" ]
HEALTHCHECK CMD [ "sh", "-c", "netstat -ln | grep 9083" ]

And I slightly modified run.sh to meet my requirements:

#!/usr/bin/env bash

# Strict mode: -e exit on error, -u error on unset variables, -x trace every
# command (noisy but useful in container logs), -o pipefail fail a pipeline
# if any stage fails. Note -u means every ${VAR} referenced below must be set.
set -euxo pipefail

# Emit (to stdout) the JDBC <property> elements shared by hive-site.xml and
# metastore-site.xml, plus the thrift URI property.
# Reads: DATABASE_DRIVER, DATABASE_TYPE_JDBC, DATABASE_HOST, DATABASE_PORT,
#        DATABASE_DB, DATABASE_USER, DATABASE_PASSWORD, S3_BUCKET.
# Under 'set -u' any missing variable aborts the script — treat all of the
# above as required.
generate_database_config(){
  cat << XML
<property>
  <name>javax.jdo.option.ConnectionDriverName</name>
  <value>${DATABASE_DRIVER}</value>
</property>
<property>
  <name>javax.jdo.option.ConnectionURL</name>
  <value>jdbc:${DATABASE_TYPE_JDBC}://${DATABASE_HOST}:${DATABASE_PORT}/${DATABASE_DB}</value>
</property>
<property>
  <name>javax.jdo.option.ConnectionUserName</name>
  <value>${DATABASE_USER}</value>
</property>
<property>
  <name>javax.jdo.option.ConnectionPassword</name>
  <value>${DATABASE_PASSWORD}</value>
</property>
<property>
  <name>hive.metastore.uris</name>
  <value>thrift://hive-metastore-${S3_BUCKET}.hive.svc.cluster.local:9083</value>
  <description>Thrift URI for the remote metastore. Used by metastore client to connect to remote metastore.</description>
</property>
XML
}

# Write a log4j2 .properties config to the file given as $1.
#
# Fixes the original config, which was rejected by log4j2 at startup:
# - 'logger.meta.name'/'logger.meta.level' were assigned twice, so the second
#   assignment (org.apache.hadoop.fs.s3a) silently overwrote the first and the
#   hive.metastore logger was lost — the s3a logger now has its own name.
# - 'loggers = meta' never declared the hive/datanucleusorg/datanucleus
#   loggers that were configured below it; all five are listed now.
# - trailing whitespace after several 'DEBUG' values is removed.
generate_log4j2_config(){
  cat << PROPERTIES > "$1"
name = metastore

appenders = console

appender.console.type = Console
appender.console.name = consoleLogger
appender.console.layout.type = PatternLayout
appender.console.layout.pattern = %d{MM/dd/yy HH:mm:ss} %p %c: %m%n

loggers = meta, s3a, hive, datanucleusorg, datanucleus

logger.meta.name = org.apache.hadoop.hive.metastore
logger.meta.level = DEBUG

logger.s3a.name = org.apache.hadoop.fs.s3a
logger.s3a.level = DEBUG

logger.hive.name = org.apache.hive
logger.hive.level = DEBUG

logger.datanucleusorg.name = org.datanucleus
logger.datanucleusorg.level = DEBUG

logger.datanucleus.name = DataNucleus
logger.datanucleus.level = DEBUG

rootLogger.level = DEBUG
rootLogger.appenderRefs = console
rootLogger.appenderRef.console.ref = consoleLogger
PROPERTIES
}

# Render hive-site.xml (read by schematool for the JDBC settings) into the
# file given as $1. Requires the env vars used by generate_database_config.
generate_hive_site_config(){
  # 'local' keeps the intermediate out of the global scope; declaration and
  # assignment are split so a failing command substitution is not masked by
  # local's own (successful) exit status.
  local database_config
  database_config=$(generate_database_config)
  cat << XML > "$1"
<configuration>
$database_config
</configuration>
XML
}

# Render metastore-site.xml into the file given as $1: thrift port/URI,
# warehouse location on S3, and the shared JDBC properties.
# Reads: S3_BUCKET plus the env vars used by generate_database_config.
generate_metastore_site_config(){
  # 'local' keeps the intermediate out of the global scope; split from the
  # assignment so a failure in the substitution is not masked.
  local database_config
  database_config=$(generate_database_config)
  cat << XML > "$1"
<configuration>
  <property>
    <name>metastore.task.threads.always</name>
    <value>org.apache.hadoop.hive.metastore.events.EventCleanerTask</value>
  </property>
  <property>
    <name>metastore.expression.proxy</name>
    <value>org.apache.hadoop.hive.metastore.DefaultPartitionExpressionProxy</value>
  </property>
  $database_config
  <property>
    <name>metastore.warehouse.dir</name>
    <value>s3a://${S3_BUCKET}/</value>
  </property>
  <property>
    <name>metastore.thrift.port</name>
    <value>9083</value>
  </property>
  <property>
    <name>metastore.thrift.uris</name>
    <value>thrift://hive-metastore-${S3_BUCKET}.hive.svc.cluster.local:9083</value>
    <description>Thrift URI for the remote metastore. Used by metastore client to connect to remote metastore.</description>
  </property>
</configuration>
XML
}

# Emit (to stdout) the fs.s3a.* <property> overrides for a custom S3 endpoint
# such as MinIO. Prints nothing useful when S3_ENDPOINT_URL is unset/empty.
# Reads: S3_ENDPOINT_URL (optional), AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY
# (optional, default to empty).
generate_s3_custom_endpoint(){
  # ${S3_ENDPOINT_URL:-} is required: the script runs under 'set -u', so a
  # bare "$S3_ENDPOINT_URL" would abort the whole script when the variable
  # is not exported at all instead of falling back to the no-endpoint case.
  if [[ -z "${S3_ENDPOINT_URL:-}" ]]; then
    echo ""
    return 0
  fi

  cat << XML
<property>
  <name>fs.s3a.endpoint</name>
  <value>${S3_ENDPOINT_URL}</value>
</property>
<property>
  <name>fs.s3a.access.key</name>
  <value>${AWS_ACCESS_KEY_ID:-}</value>
</property>
<property>
  <name>fs.s3a.secret.key</name>
  <value>${AWS_SECRET_ACCESS_KEY:-}</value>
</property>
<property>
  <name>fs.s3a.connection.ssl.enabled</name>
  <value>true</value>
</property>
<property>
  <name>fs.s3a.path.style.access</name>
  <value>true</value>
</property>
<property>
  <name>fs.s3a.signing-algorithm</name>
  <value>S3SignerType</value>
</property>
XML
}

# Render core-site.xml (fs.defaultFS plus the S3A filesystem binding and any
# custom-endpoint overrides) into the file given as $1.
# Reads: S3_BUCKET, plus whatever generate_s3_custom_endpoint consumes.
generate_core_site_config(){
  # 'local' keeps the intermediate out of the global scope; split from the
  # assignment so a failure in the substitution is not masked.
  local custom_endpoint_configs
  custom_endpoint_configs=$(generate_s3_custom_endpoint)
  cat << XML > "$1"
<configuration>
  <property>
      <name>fs.defaultFS</name>
      <value>s3a://${S3_BUCKET}</value>
  </property>
  <property>
      <name>fs.s3a.impl</name>
      <value>org.apache.hadoop.fs.s3a.S3AFileSystem</value>
  </property>
  $custom_endpoint_configs
</configuration>
XML
}

# Validate the metastore schema in the configured database; when validation
# does not report success, initialize the schema from scratch.
# Reads: DATABASE_TYPE. Returns non-zero if -initSchema itself fails.
run_migrations(){
  # grep -F matches the literal string '[SUCCESS]'. Without -F the brackets
  # form a character class, so the original pattern matched any line
  # containing S, U, C or E — making validation "pass" spuriously.
  if /opt/hive-metastore/bin/schematool -dbType "$DATABASE_TYPE" -validate | grep 'Done with metastore validation' | grep -F '[SUCCESS]'; then
    echo 'Database OK'
    return 0
  else
    # TODO: how to apply new version migrations or repair validation issues
    /opt/hive-metastore/bin/schematool --verbose -dbType "$DATABASE_TYPE" -initSchema
  fi
}

# configure & run schematool
# hive-site.xml supplies the JDBC connection settings schematool needs.
generate_hive_site_config /opt/hadoop/etc/hadoop/hive-site.xml
run_migrations

# configure & start metastore (in foreground)
generate_log4j2_config /opt/hive-metastore/conf/log4j2.properties
generate_metastore_site_config /opt/hive-metastore/conf/metastore-site.xml
# core-site.xml carries fs.defaultFS and the fs.s3a.* settings.
generate_core_site_config /opt/hadoop/etc/hadoop/core-site.xml
# Runs in the foreground so this script (the container entrypoint) stays alive.
/opt/hive-metastore/bin/start-metastore

As you can see, I tried adding extra symlinks for the hadoop-aws and aws-java-sdk bundle jars, but the error did not change.
I also tried to raise the log verbosity (the log4j2 config above) to verify whether the classpath is correct, but with that config the metastore fails to start at all.

Any suggestion on how to solve the problem both for the missing class and the logging?

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions