-
Notifications
You must be signed in to change notification settings - Fork 31
Open
Description
I am using this image as a standalone hive metastore for trino.
It works perfectly fine when reading tables stored in MinIO, creating external tables from Parquet files in MinIO and creating schemas.
But as soon as I try an insert operation or more in general a write operation that should create parquet files in MinIO I get this error:
trino> create schema test.db with (location='s3a://test/db/');
CREATE SCHEMA
trino> create table test.db.table (currency varchar, date varchar, rate varchar) with (external_location='s3a://test/db/table/', format='PARQUET');
CREATE TABLE
trino> INSERT INTO test.db.table VALUES ('USD', '2024-02-15', '1.25');
Query 20250217_141259_00010_retkt failed: java.lang.ClassNotFoundException: Class org.apache.hadoop.fs.s3a.S3AFileSystem not found

This is the Dockerfile I am using at the moment:
# Standalone Hive metastore image backed by Postgres, with Hadoop's S3A
# connector linked onto the classpath so warehouse data can live on S3/MinIO.
# NOTE(review): openjdk:16-slim is end-of-life and receives no security
# updates — consider a maintained JDK base image once compatibility is verified.
FROM openjdk:16-slim

ARG HADOOP_VERSION=3.3.0
ARG HIVE_METASTORE_VERSION=3.0.0
ARG POSTGRES_CONNECTOR_VERSION=42.2.18

# Set necessary environment variables.
ENV HADOOP_HOME="/opt/hadoop"
ENV PATH="/opt/spark/bin:/opt/hadoop/bin:${PATH}"
ENV DATABASE_DRIVER=org.postgresql.Driver
ENV DATABASE_TYPE=postgres
ENV DATABASE_TYPE_JDBC=postgresql
ENV DATABASE_PORT=5432
# Wildcards are expanded by the JVM at launch time, not by the shell here.
# NOTE(review): verify that start-metastore actually honors HADOOP_CLASSPATH;
# if it builds its own classpath, only the lib/ symlinks below take effect.
ENV HADOOP_CLASSPATH=${HADOOP_HOME}/share/hadoop/common/lib/*:${HADOOP_HOME}/share/hadoop/common/*:${HADOOP_HOME}/share/hadoop/tools/lib/*

WORKDIR /app
SHELL ["/bin/bash", "-o", "pipefail", "-c"]

# hadolint ignore=DL3008,SC2086
# Install OS dependencies. Clean the apt lists in the SAME layer that created
# them: a 'rm -rf' in a later layer cannot shrink an earlier image layer.
# NOTE(review): curl appears unused during the build (artifacts are COPYed in)
# and is purged below anyway — confirm it can be dropped entirely.
RUN apt-get update -y && \
    apt-get install -y curl net-tools --no-install-recommends && \
    rm -rf /var/lib/apt/lists/*

COPY hadoop-${HADOOP_VERSION} /opt/hadoop-${HADOOP_VERSION}
COPY apache-hive-metastore-${HIVE_METASTORE_VERSION}-bin /opt/apache-hive-metastore-${HIVE_METASTORE_VERSION}-bin
COPY log4j-web-2.17.1.jar /opt/log4j-web-2.17.1.jar
COPY postgresql-${POSTGRES_CONNECTOR_VERSION}.jar /opt/postgresql-${POSTGRES_CONNECTOR_VERSION}.jar

# Fix 'java.lang.NoSuchMethodError: com.google.common.base.Preconditions.checkArgument'
# Keep this until this lands: https://issues.apache.org/jira/browse/HIVE-22915
RUN rm /opt/apache-hive-metastore-$HIVE_METASTORE_VERSION-bin/lib/guava-19.0.jar && \
    cp /opt/hadoop-$HADOOP_VERSION/share/hadoop/hdfs/lib/guava-27.0-jre.jar /opt/apache-hive-metastore-$HIVE_METASTORE_VERSION-bin/lib/

# Create symlinks to simplify configuration
RUN ln -s /opt/hadoop-$HADOOP_VERSION /opt/hadoop && \
    ln -s /opt/apache-hive-metastore-$HIVE_METASTORE_VERSION-bin /opt/hive-metastore && \
    ln -s /opt/postgresql-$POSTGRES_CONNECTOR_VERSION.jar /opt/hadoop/share/hadoop/common/lib/ && \
    ln -s /opt/postgresql-$POSTGRES_CONNECTOR_VERSION.jar /opt/hive-metastore/lib/ && \
    ln -s /opt/log4j-web-2.17.1.jar /opt/hadoop/share/hadoop/common/lib/ && \
    ln -s /opt/log4j-web-2.17.1.jar /opt/hive-metastore/lib/

# Add S3a jars to hadoop classpath using this hack.
# NOTE(review): the globs expand at build time; if tools/lib contains no
# matching jars the link is created with a literal '*' name — verify the
# hadoop-aws and aws-java-sdk bundles are present in the copied tree.
RUN ln -s /opt/hadoop/share/hadoop/tools/lib/hadoop-aws* /opt/hadoop/share/hadoop/common/lib/ && \
    ln -s /opt/hadoop/share/hadoop/tools/lib/aws-java-sdk* /opt/hadoop/share/hadoop/common/lib/ && \
    ln -s /opt/hadoop/share/hadoop/tools/lib/hadoop-aws* /opt/hive-metastore/lib/ && \
    ln -s /opt/hadoop/share/hadoop/tools/lib/aws-java-sdk* /opt/hive-metastore/lib/ && \
    ln -s /opt/hadoop/share/hadoop/common/hadoop-common-${HADOOP_VERSION}.jar /opt/hive-metastore/lib/

# Purge build artifacts (package removal works without the apt lists).
RUN apt-get purge -y --auto-remove curl && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

COPY run.sh run.sh
RUN chmod +x run.sh
ENTRYPOINT [ "./run.sh" ]
HEALTHCHECK CMD [ "sh", "-c", "netstat -ln | grep 9083" ]

And I slightly modified run.sh to meet my requirements:
#!/usr/bin/env bash
set -euxo pipefail
generate_database_config(){
# Print the JDBC connection and thrift-URI <property> elements shared by
# both hive-site.xml and metastore-site.xml. The unquoted heredoc delimiter
# lets the DATABASE_* / S3_BUCKET environment variables expand in place.
cat << DB_XML
<property>
<name>javax.jdo.option.ConnectionDriverName</name>
<value>${DATABASE_DRIVER}</value>
</property>
<property>
<name>javax.jdo.option.ConnectionURL</name>
<value>jdbc:${DATABASE_TYPE_JDBC}://${DATABASE_HOST}:${DATABASE_PORT}/${DATABASE_DB}</value>
</property>
<property>
<name>javax.jdo.option.ConnectionUserName</name>
<value>${DATABASE_USER}</value>
</property>
<property>
<name>javax.jdo.option.ConnectionPassword</name>
<value>${DATABASE_PASSWORD}</value>
</property>
<property>
<name>hive.metastore.uris</name>
<value>thrift://hive-metastore-${S3_BUCKET}.hive.svc.cluster.local:9083</value>
<description>Thrift URI for the remote metastore. Used by metastore client to connect to remote metastore.</description>
</property>
DB_XML
}
generate_log4j2_config(){
# Write a Log4j2 properties file to "$1" with DEBUG logging for the
# metastore, S3A, Hive and DataNucleus packages, all to the console.
#
# Fixes vs. the previous version:
#   1. 'logger.meta.*' was assigned twice — the second assignment (for
#      org.apache.hadoop.fs.s3a) silently overwrote the metastore logger.
#      The S3A logger now uses its own 's3a' id.
#   2. 'loggers =' only declared 'meta'; Log4j2 ignores logger definitions
#      whose id is not declared there, so the hive/datanucleus loggers
#      never took effect. All ids are now declared.
cat << PROPERTIES > "$1"
name = metastore
appenders = console
appender.console.type = Console
appender.console.name = consoleLogger
appender.console.layout.type = PatternLayout
appender.console.layout.pattern = %d{MM/dd/yy HH:mm:ss} %p %c: %m%n
loggers = meta, s3a, hive, datanucleusorg, datanucleus
logger.meta.name = org.apache.hadoop.hive.metastore
logger.meta.level = DEBUG
logger.s3a.name = org.apache.hadoop.fs.s3a
logger.s3a.level = DEBUG
logger.hive.name = org.apache.hive
logger.hive.level = DEBUG
logger.datanucleusorg.name = org.datanucleus
logger.datanucleusorg.level = DEBUG
logger.datanucleus.name = DataNucleus
logger.datanucleus.level = DEBUG
rootLogger.level = DEBUG
rootLogger.appenderRefs = console
rootLogger.appenderRef.console.ref = consoleLogger
PROPERTIES
}
generate_hive_site_config(){
# Write hive-site.xml to the path in "$1", wrapping the shared database
# connection properties in a <configuration> root element.
db_section=$(generate_database_config)
cat << HIVE_XML > "$1"
<configuration>
$db_section
</configuration>
HIVE_XML
}
generate_metastore_site_config(){
# Write metastore-site.xml to the path in "$1": event-cleaner task,
# partition-expression proxy, the shared database connection block, the
# S3A warehouse location, and the thrift listener endpoint.
db_section=$(generate_database_config)
cat << METASTORE_XML > "$1"
<configuration>
<property>
<name>metastore.task.threads.always</name>
<value>org.apache.hadoop.hive.metastore.events.EventCleanerTask</value>
</property>
<property>
<name>metastore.expression.proxy</name>
<value>org.apache.hadoop.hive.metastore.DefaultPartitionExpressionProxy</value>
</property>
$db_section
<property>
<name>metastore.warehouse.dir</name>
<value>s3a://${S3_BUCKET}/</value>
</property>
<property>
<name>metastore.thrift.port</name>
<value>9083</value>
</property>
<property>
<name>metastore.thrift.uris</name>
<value>thrift://hive-metastore-${S3_BUCKET}.hive.svc.cluster.local:9083</value>
<description>Thrift URI for the remote metastore. Used by metastore client to connect to remote metastore.</description>
</property>
</configuration>
METASTORE_XML
}
generate_s3_custom_endpoint(){
# Print fs.s3a.* <property> overrides for a custom S3-compatible endpoint
# (e.g. MinIO). Prints nothing useful when S3_ENDPOINT_URL is unset or
# empty, so stock AWS endpoint resolution applies.
#
# Fix: the script runs under 'set -u' (set -euxo pipefail); referencing an
# unset S3_ENDPOINT_URL aborted the entire script. ${VAR:-} defaults make
# the checks safe whether the variables are unset or merely empty.
if [ -z "${S3_ENDPOINT_URL:-}" ]; then
  echo ""
  return 0
fi
cat << XML
<property>
<name>fs.s3a.endpoint</name>
<value>${S3_ENDPOINT_URL}</value>
</property>
<property>
<name>fs.s3a.access.key</name>
<value>${AWS_ACCESS_KEY_ID:-}</value>
</property>
<property>
<name>fs.s3a.secret.key</name>
<value>${AWS_SECRET_ACCESS_KEY:-}</value>
</property>
<property>
<name>fs.s3a.connection.ssl.enabled</name>
<value>true</value>
</property>
<property>
<name>fs.s3a.path.style.access</name>
<value>true</value>
</property>
<property>
<name>fs.s3a.signing-algorithm</name>
<value>S3SignerType</value>
</property>
XML
}
generate_core_site_config(){
# Write core-site.xml to the path in "$1": default filesystem pointing at
# the warehouse bucket, the explicit S3A implementation class, plus any
# custom-endpoint overrides produced by generate_s3_custom_endpoint.
s3a_overrides=$(generate_s3_custom_endpoint)
cat << CORE_XML > "$1"
<configuration>
<property>
<name>fs.defaultFS</name>
<value>s3a://${S3_BUCKET}</value>
</property>
<property>
<name>fs.s3a.impl</name>
<value>org.apache.hadoop.fs.s3a.S3AFileSystem</value>
</property>
$s3a_overrides
</configuration>
CORE_XML
}
run_migrations(){
# Validate the existing metastore schema and initialize it from scratch
# when validation does not report success.
# Fix: the previous unquoted-in-intent pattern '[SUCCESS]' was a grep
# bracket expression matching ANY single character of S,U,C,E — so almost
# any validation line "succeeded". -F matches the literal token instead;
# '--' guards against the pattern being parsed as an option.
if /opt/hive-metastore/bin/schematool -dbType "$DATABASE_TYPE" -validate | grep 'Done with metastore validation' | grep -F -- '[SUCCESS]'; then
  echo 'Database OK'
  return 0
else
  # TODO: how to apply new version migrations or repair validation issues
  /opt/hive-metastore/bin/schematool --verbose -dbType "$DATABASE_TYPE" -initSchema
fi
}
# configure & run schematool
# hive-site.xml only needs the DB connection block; schematool reads it
# from the Hadoop config dir.
generate_hive_site_config /opt/hadoop/etc/hadoop/hive-site.xml
run_migrations
# configure & start metastore (in foreground)
generate_log4j2_config /opt/hive-metastore/conf/log4j2.properties
generate_metastore_site_config /opt/hive-metastore/conf/metastore-site.xml
generate_core_site_config /opt/hadoop/etc/hadoop/core-site.xml
/opt/hive-metastore/bin/start-metastore

As you can see, I tried adding more symlinks to the hadoop-aws and aws-bundle jars, but nothing has changed.
I also tried to improve logging to see if the classpath is correct but it breaks the image.
Any suggestion on how to solve the problem both for the missing class and the logging?
Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
No labels