@@ -13,11 +13,112 @@ mkdir $SPARK_HOME/conf
 echo "SPARK_LOCAL_IP=127.0.0.1" > $SPARK_HOME/conf/spark-env.sh
 echo "JAVA_HOME=/usr/lib/jvm/$(ls /usr/lib/jvm | grep java)/jre" >> $SPARK_HOME/conf/spark-env.sh
 
+# Download core S3 filesystem JARs with updated versions
+wget -q https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar -P ${SPARK_HOME}/jars/
+wget -q https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/${AWS_SDK_VERSION}/aws-java-sdk-bundle-${AWS_SDK_VERSION}.jar -P ${SPARK_HOME}/jars/
 
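+# NOTE: hadoop-aws must match the Hadoop version Spark was built against, and each
+# hadoop-aws release expects one specific aws-java-sdk-bundle release (for Hadoop
+# 3.3.4 that is bundle 1.12.262), so bump HADOOP_VERSION and AWS_SDK_VERSION together
+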
+# Additional JARs for better S3 compatibility
+wget -q https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-common/${HADOOP_VERSION}/hadoop-common-${HADOOP_VERSION}.jar -P ${SPARK_HOME}/jars/
+wget -q https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-client/${HADOOP_VERSION}/hadoop-client-${HADOOP_VERSION}.jar -P ${SPARK_HOME}/jars/
+wget -q https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-client-api/${HADOOP_VERSION}/hadoop-client-api-${HADOOP_VERSION}.jar -P ${SPARK_HOME}/jars/
+wget -q https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-client-runtime/${HADOOP_VERSION}/hadoop-client-runtime-${HADOOP_VERSION}.jar -P ${SPARK_HOME}/jars/
 
+# Add Hadoop statistics and fs libraries to fix NoSuchMethodError
+wget -q https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-annotations/${HADOOP_VERSION}/hadoop-annotations-${HADOOP_VERSION}.jar -P ${SPARK_HOME}/jars/
+wget -q https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-auth/${HADOOP_VERSION}/hadoop-auth-${HADOOP_VERSION}.jar -P ${SPARK_HOME}/jars/
+# The hadoop-thirdparty artifacts are versioned independently of Hadoop itself
+# (Hadoop 3.3.x depends on thirdparty release 1.1.1), so do not use ${HADOOP_VERSION} here
+wget -q https://repo1.maven.org/maven2/org/apache/hadoop/thirdparty/hadoop-shaded-guava/1.1.1/hadoop-shaded-guava-1.1.1.jar -P ${SPARK_HOME}/jars/
+wget -q https://repo1.maven.org/maven2/org/apache/hadoop/thirdparty/hadoop-shaded-protobuf_3_7/1.1.1/hadoop-shaded-protobuf_3_7-1.1.1.jar -P ${SPARK_HOME}/jars/
+wget -q https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-mapreduce-client-core/${HADOOP_VERSION}/hadoop-mapreduce-client-core-${HADOOP_VERSION}.jar -P ${SPARK_HOME}/jars/
+wget -q https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-mapreduce-client-common/${HADOOP_VERSION}/hadoop-mapreduce-client-common-${HADOOP_VERSION}.jar -P ${SPARK_HOME}/jars/
+wget -q https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-hdfs-client/${HADOOP_VERSION}/hadoop-hdfs-client-${HADOOP_VERSION}.jar -P ${SPARK_HOME}/jars/
+
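+# Optional sanity check: wget -q fails silently, so flag any core JAR that did not
+# download (a missing JAR only surfaces later as a ClassNotFoundException)
+for jar in hadoop-aws hadoop-common hadoop-client-api hadoop-client-runtime; do
+    [ -f "${SPARK_HOME}/jars/${jar}-${HADOOP_VERSION}.jar" ] || echo "WARNING: ${jar}-${HADOOP_VERSION}.jar is missing"
+done
+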
+# Fix for IOStatisticsBinding NoSuchMethodError
+# Download the specific version that contains the required IOStatisticsBinding class
+FIXED_VERSION="3.3.4"
+echo "Downloading fixed Hadoop libraries version $FIXED_VERSION"
+wget -q https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-common/$FIXED_VERSION/hadoop-common-$FIXED_VERSION.jar -P ${SPARK_HOME}/jars/
+wget -q https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/$FIXED_VERSION/hadoop-aws-$FIXED_VERSION.jar -P ${SPARK_HOME}/jars/
+
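+# CAUTION: if FIXED_VERSION differs from HADOOP_VERSION, two versions of
+# hadoop-common and hadoop-aws end up on the classpath, and whichever the JVM
+# loads first wins; that can reintroduce the very NoSuchMethodError being fixed
+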
+# Best-effort extra: the hadoop-common tests JAR (IOStatisticsBinding itself ships
+# in the main hadoop-common JAR, so this download is optional)
+wget -q https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-common/$FIXED_VERSION/hadoop-common-$FIXED_VERSION-tests.jar -P ${SPARK_HOME}/jars/ || echo "hadoop-common-tests not found"
+
+# Best-effort extra: the hadoop-aws tests JAR (the S3A implementation classes ship
+# in the main hadoop-aws JAR, so this download is optional too)
+wget -q https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/$FIXED_VERSION/hadoop-aws-$FIXED_VERSION-tests.jar -P ${SPARK_HOME}/jars/ || echo "hadoop-aws-tests not found"
+
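+# Optional cleanup: tests JARs are not needed at runtime and can be removed later
+# to keep the Lambda image small, e.g. rm -f ${SPARK_HOME}/jars/*-tests.jar
+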
+# Copy the existing log4j.properties file to the Spark conf directory
+echo "Copying existing log4j.properties file to Spark conf directory"
+cp /opt/spark-on-lambda-handler/log4j.properties ${SPARK_HOME}/conf/
+
+# Create a core-site.xml file with S3A configurations
+echo "Creating core-site.xml file"
+cat > ${SPARK_HOME}/conf/core-site.xml << EOL
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+<configuration>
+  <property>
+    <name>fs.s3a.impl</name>
+    <value>org.apache.hadoop.fs.s3a.S3AFileSystem</value>
+  </property>
+  <property>
+    <name>fs.s3a.aws.credentials.provider</name>
+    <value>com.amazonaws.auth.DefaultAWSCredentialsProviderChain</value>
+  </property>
+  <property>
+    <name>fs.s3a.connection.maximum</name>
+    <value>100</value>
+  </property>
+  <property>
+    <name>fs.s3a.experimental.input.fadvise</name>
+    <value>sequential</value>
+  </property>
+  <property>
+    <name>fs.s3a.impl.disable.cache</name>
+    <value>true</value>
+  </property>
+  <property>
+    <name>fs.s3a.path.style.access</name>
+    <value>true</value>
+  </property>
+  <property>
+    <name>fs.s3a.committer.name</name>
+    <value>directory</value>
+  </property>
+  <property>
+    <name>fs.s3a.committer.staging.conflict-mode</name>
+    <value>append</value>
+  </property>
+  <property>
+    <name>fs.s3a.committer.staging.unique-filenames</name>
+    <value>true</value>
+  </property>
+  <property>
+    <name>fs.s3a.fast.upload</name>
+    <value>true</value>
+  </property>
+  <property>
+    <name>mapreduce.fileoutputcommitter.algorithm.version</name>
+    <value>2</value>
+  </property>
+</configuration>
+EOL
+
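+# Optional smoke test (assumes AWS credentials and a readable object; replace the
+# placeholder s3a:// path before running):
+# echo 'spark.read.textFile("s3a://your-bucket/some-key").count()' | ${SPARK_HOME}/bin/spark-shell
+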
+# Add AWS SDK v2 components for better S3 compatibility
+wget -q https://repo1.maven.org/maven2/software/amazon/awssdk/s3/2.20.56/s3-2.20.56.jar -P ${SPARK_HOME}/jars/
+wget -q https://repo1.maven.org/maven2/software/amazon/awssdk/utils/2.20.56/utils-2.20.56.jar -P ${SPARK_HOME}/jars/
+wget -q https://repo1.maven.org/maven2/software/amazon/awssdk/auth/2.20.56/auth-2.20.56.jar -P ${SPARK_HOME}/jars/
+wget -q https://repo1.maven.org/maven2/software/amazon/awssdk/http-client-spi/2.20.56/http-client-spi-2.20.56.jar -P ${SPARK_HOME}/jars/
+wget -q https://repo1.maven.org/maven2/software/amazon/awssdk/regions/2.20.56/regions-2.20.56.jar -P ${SPARK_HOME}/jars/
+wget -q https://repo1.maven.org/maven2/software/amazon/awssdk/sdk-core/2.20.56/sdk-core-2.20.56.jar -P ${SPARK_HOME}/jars/
+wget -q https://repo1.maven.org/maven2/software/amazon/awssdk/apache-client/2.20.56/apache-client-2.20.56.jar -P ${SPARK_HOME}/jars/
+wget -q https://repo1.maven.org/maven2/software/amazon/awssdk/aws-core/2.20.56/aws-core-2.20.56.jar -P ${SPARK_HOME}/jars/
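+# NOTE: the AWS SDK v2 modules above should all share one version; pinning it once
+# (e.g. AWS_SDK_V2_VERSION="2.20.56") keeps them in sync when upgrading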
 
-wget -q https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar -P ${SPARK_HOME}/jars/
-wget -q https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/${AWS_SDK_VERSION}/aws-java-sdk-bundle-${AWS_SDK_VERSION}.jar -P ${SPARK_HOME}/jars/
 # JAR files needed to connect to Snowflake
 # wget -q https://repo1.maven.org/maven2/net/snowflake/spark-snowflake_2.12/2.12.0-spark_3.3/spark-snowflake_2.12-2.12.0-spark_3.3.jar -P ${SPARK_HOME}/jars/
 # wget -q https://repo1.maven.org/maven2/net/snowflake/snowflake-jdbc/3.13.33/snowflake-jdbc-3.13.33.jar -P ${SPARK_HOME}/jars/
@@ -61,4 +162,4 @@ echo $fw
     echo "Unknown framework: $fw"
     ;;
 esac
-done
+done