Skip to content

Commit c22eb87

Browse files
Mariamalmesfer (Mariam-Almesfer)
and authored
[VL] Add S3 integration tests to gluten (#11516)
Co-authored-by: Mariam-Almesfer <mariam.almesfer@ibm.com>
1 parent a4e5655 commit c22eb87

File tree

5 files changed

+87
-3
lines changed

5 files changed

+87
-3
lines changed

.github/workflows/util/install-resources.sh

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,66 @@ EOF
9393
"$HADOOP_HOME/bin/hdfs" dfs -ls /
9494
}
9595

96+
function install_minio {
  # Install the MinIO server and the `mc` client CLI into /usr/local/bin.
  # Returns non-zero if any download fails, so callers can detect a broken
  # installation instead of seeing a misleading success message.
  echo "Installing MinIO..."

  apt-get update -y
  apt-get install -y curl

  # Server binary. -f makes curl fail (non-zero) on HTTP errors so a bad
  # download does not leave a truncated/missing file silently.
  curl -fsSL -o /usr/local/bin/minio https://dl.min.io/server/minio/release/linux-amd64/minio || return 1
  chmod +x /usr/local/bin/minio

  # Client (mc) binary, used later by setup_minio to create the test bucket.
  curl -fsSL -o /usr/local/bin/mc https://dl.min.io/client/mc/release/linux-amd64/mc || return 1
  chmod +x /usr/local/bin/mc

  echo "MinIO installed successfully"
}
110+
111+
function setup_minio {
  # Download the hadoop-aws + AWS SDK jars matching the given Spark version
  # into gluten-it's lib dir, start a local MinIO server on port 9100, and
  # create the `gluten-it` bucket used by the S3 integration tests.
  #   $1 - Spark version (e.g. "3.5"); defaults to 3.5.
  local spark_version="${1:-3.5}"

  # Pick the hadoop-aws version (and the matching AWS SDK artifact) that the
  # corresponding Spark/Hadoop line was built against. Hadoop 3.4+ switched
  # from the v1 `aws-java-sdk-bundle` to the v2 `bundle` artifact.
  case "$spark_version" in
    3.3)      hadoop_aws_version="3.3.2"; aws_sdk_artifact="aws-java-sdk-bundle"; aws_sdk_version="1.12.262" ;;
    3.4|3.5*) hadoop_aws_version="3.3.4"; aws_sdk_artifact="aws-java-sdk-bundle"; aws_sdk_version="1.12.262" ;;
    4.0)      hadoop_aws_version="3.4.0"; aws_sdk_artifact="bundle";              aws_sdk_version="2.25.11" ;;
    4.1)      hadoop_aws_version="3.4.1"; aws_sdk_artifact="bundle";              aws_sdk_version="2.25.11" ;;
    *)        hadoop_aws_version="3.3.4"; aws_sdk_artifact="aws-java-sdk-bundle"; aws_sdk_version="1.12.262" ;;
  esac

  # Jars must land where gluten-it's launcher picks up its classpath.
  local spark_jars_dir="${GITHUB_WORKSPACE:-$PWD}/tools/gluten-it/package/target/lib"
  mkdir -p "$spark_jars_dir"

  wget -nv "https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/${hadoop_aws_version}/hadoop-aws-${hadoop_aws_version}.jar" -P "$spark_jars_dir" || return 1

  if [ "$aws_sdk_artifact" == "aws-java-sdk-bundle" ]; then
    wget -nv "https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/${aws_sdk_version}/aws-java-sdk-bundle-${aws_sdk_version}.jar" -P "$spark_jars_dir" || return 1
  else
    wget -nv "https://repo1.maven.org/maven2/software/amazon/awssdk/bundle/${aws_sdk_version}/bundle-${aws_sdk_version}.jar" -P "$spark_jars_dir" || return 1
  fi

  # Start MinIO in the background with fixed test credentials. The S3A confs
  # passed by the workflow (access key admin / admin123, endpoint :9100) must
  # stay in sync with these values.
  export MINIO_DATA_DIR="${RUNNER_TEMP:-/tmp}/minio-data"
  mkdir -p "$MINIO_DATA_DIR"
  export MINIO_ROOT_USER=admin
  export MINIO_ROOT_PASSWORD=admin123

  nohup minio server --address ":9100" --console-address ":9101" "$MINIO_DATA_DIR" > /tmp/minio.log 2>&1 &

  # Poll the readiness endpoint for up to 60s; track success in a flag so the
  # probe is issued only once per iteration.
  local minio_ready=0
  for i in {1..60}; do
    if curl -sSf http://localhost:9100/minio/health/ready >/dev/null 2>&1; then
      minio_ready=1
      break
    fi
    sleep 1
  done

  if [ "$minio_ready" -ne 1 ]; then
    echo "MinIO failed to start"
    cat /tmp/minio.log || true
    # Intentionally `exit` (not `return`): this must fail the CI step even if
    # the caller does not check the function's status.
    exit 1
  fi

  # Create the bucket used by the integration tests; tolerate "already exists"
  # so setup stays idempotent across re-runs.
  mc alias set s3local http://localhost:9100 "$MINIO_ROOT_USER" "$MINIO_ROOT_PASSWORD"
  mc mb -p s3local/gluten-it || true
}
155+
96156
# Installs Spark binary and source releases with:
97157
# 1 - spark version
98158
# 2 - hadoop version

.github/workflows/velox_backend_x86.yml

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -183,7 +183,15 @@ jobs:
183183
source .github/workflows/util/install-resources.sh
184184
install_hadoop
185185
setup_hdfs
186+
- name: Install MinIO
187+
if: matrix.os == 'ubuntu:22.04' && matrix.spark == 'spark-3.5' && matrix.java == 'java-8'
188+
shell: bash
189+
run: |
190+
export JAVA_HOME=/usr/lib/jvm/${{ matrix.java }}-openjdk-amd64
191+
source .github/workflows/util/install-resources.sh
192+
install_minio
186193
- name: Build and run TPC-H / TPC-DS
194+
shell: bash
187195
run: |
188196
cd $GITHUB_WORKSPACE/
189197
export JAVA_HOME=/usr/lib/jvm/${{ matrix.java }}-openjdk-amd64
@@ -198,6 +206,14 @@ jobs:
198206
esac
199207
cd $GITHUB_WORKSPACE/tools/gluten-it
200208
$GITHUB_WORKSPACE/$MVN_CMD clean install -P${{ matrix.spark }} -P${{ matrix.java }}
209+
# Setup S3 JARs after gluten-it build
210+
if [ "${{ matrix.os }}" = "ubuntu:22.04" ] && \
211+
[ "${{ matrix.spark }}" = "spark-3.5" ] && \
212+
[ "${{ matrix.java }}" = "java-8" ]; then
213+
source $GITHUB_WORKSPACE/.github/workflows/util/install-resources.sh
214+
SPARK_VERSION=$(echo "${{ matrix.spark }}" | sed 's/spark-//')
215+
setup_minio "$SPARK_VERSION"
216+
fi
201217
GLUTEN_IT_JVM_ARGS=-Xmx5G sbin/gluten-it.sh queries-compare \
202218
--local --preset=velox --benchmark-type=h --error-on-memleak --off-heap-size=10g -s=1.0 --threads=16 --iterations=1 \
203219
&& GLUTEN_IT_JVM_ARGS=-Xmx6G sbin/gluten-it.sh queries-compare \
@@ -208,6 +224,14 @@ jobs:
208224
GLUTEN_IT_JVM_ARGS=-Xmx6G sbin/gluten-it.sh queries-compare \
209225
--local --preset=velox --benchmark-type=h --error-on-memleak --off-heap-size=10g -s=1.0 --threads=16 --iterations=1 \
210226
--queries=q1 --data-dir="hdfs://localhost:9000/test"
227+
GLUTEN_IT_JVM_ARGS=-Xmx6G sbin/gluten-it.sh queries-compare \
228+
--local --preset=velox --benchmark-type=h --error-on-memleak --off-heap-size=10g -s=1.0 --threads=16 --iterations=1 \
229+
--queries=q1 --data-dir="s3a://gluten-it/test" \
230+
--extra-conf=spark.hadoop.fs.s3a.endpoint=http://localhost:9100 \
231+
--extra-conf=spark.hadoop.fs.s3a.access.key=admin \
232+
--extra-conf=spark.hadoop.fs.s3a.secret.key=admin123 \
233+
--extra-conf=spark.hadoop.fs.s3a.path.style.access=true \
234+
--extra-conf=spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem
211235
fi
212236
213237
tpc-test-centos8:

tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/QueryRunner.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ class QueryRunner(val source: String, val dataPath: String) {
6767
}
6868

6969
private def fileExists(datapath: String): Boolean = {
70-
if (datapath.startsWith("hdfs:")) {
70+
if (datapath.startsWith("hdfs:") || datapath.startsWith("s3a:")) {
7171
val uri = URI.create(datapath)
7272
FileSystem.get(uri, new Configuration()).exists(new Path(uri.getPath))
7373
} else new File(datapath).exists()

tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/ds/TpcdsSuite.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ class TpcdsSuite(
8181
"non_partitioned"
8282
}
8383
val featureFlags = dataGenFeatures.map(feature => s"-$feature").mkString("")
84-
if (dataDir.startsWith("hdfs://")) {
84+
if (dataDir.startsWith("hdfs://") || dataDir.startsWith("s3a://")) {
8585
return s"$dataDir/$TPCDS_WRITE_RELATIVE_PATH-$dataScale-$dataSource-$partitionedFlag$featureFlags"
8686
}
8787
new File(dataDir).toPath

tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/h/TpchSuite.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ class TpchSuite(
7676

7777
override private[integration] def dataWritePath(): String = {
7878
val featureFlags = dataGenFeatures.map(feature => s"-$feature").mkString("")
79-
if (dataDir.startsWith("hdfs://")) {
79+
if (dataDir.startsWith("hdfs://") || dataDir.startsWith("s3a://")) {
8080
return s"$dataDir/$TPCH_WRITE_RELATIVE_PATH-$dataScale-$dataSource$featureFlags"
8181
}
8282
new File(dataDir).toPath

0 commit comments

Comments
 (0)