Commit f5d5a00

Upgrade to leverage Spark 3.5.1 (#249)

1 parent f3db2a4

File tree: 6 files changed, +20 −13 lines

Dockerfile

Lines changed: 4 additions & 4 deletions

@@ -9,9 +9,9 @@ RUN mkdir -p /assets/ && cd /assets && \
 curl -OL https://downloads.datastax.com/enterprise/cqlsh-astra.tar.gz && \
 tar -xzf ./cqlsh-astra.tar.gz && \
 rm ./cqlsh-astra.tar.gz && \
-curl -OL https://archive.apache.org/dist/spark/spark-3.4.2/spark-3.4.2-bin-hadoop3-scala2.13.tgz && \
-tar -xzf ./spark-3.4.2-bin-hadoop3-scala2.13.tgz && \
-rm ./spark-3.4.2-bin-hadoop3-scala2.13.tgz
+curl -OL https://archive.apache.org/dist/spark/spark-3.5.1/spark-3.5.1-bin-hadoop3-scala2.13.tgz && \
+tar -xzf ./spark-3.5.1-bin-hadoop3-scala2.13.tgz && \
+rm ./spark-3.5.1-bin-hadoop3-scala2.13.tgz

 RUN apt-get update && apt-get install -y openssh-server vim python3 --no-install-recommends && \
 rm -rf /var/lib/apt/lists/* && \

@@ -46,7 +46,7 @@ RUN chmod +x ./get-latest-maven-version.sh && \
 rm -rf "$USER_HOME_DIR/.m2"

 # Add all migration tools to path
-ENV PATH="${PATH}:/assets/dsbulk/bin/:/assets/cqlsh-astra/bin/:/assets/spark-3.4.2-bin-hadoop3-scala2.13/bin/"
+ENV PATH="${PATH}:/assets/dsbulk/bin/:/assets/cqlsh-astra/bin/:/assets/spark-3.5.1-bin-hadoop3-scala2.13/bin/"

 EXPOSE 22
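The Spark version now appears twice in the Dockerfile: once in the download URL and once in the ENV PATH entry. A minimal shell sketch of deriving both strings from a single version variable, as a Dockerfile ARG could; the variable names are illustrative and not part of this commit:

```shell
# Illustrative only: build both strings from one version value so a future
# bump is a single edit.
SPARK_VERSION=3.5.1
DIST="spark-${SPARK_VERSION}-bin-hadoop3-scala2.13"

# Download URL used by the curl step
echo "https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/${DIST}.tgz"

# Directory added to PATH by the ENV step
echo "/assets/${DIST}/bin/"
```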

README.md

Lines changed: 4 additions & 4 deletions

@@ -7,7 +7,7 @@
 
 Migrate and Validate Tables between Origin and Target Cassandra Clusters.
 
-> :warning: Please note this job has been tested with spark version [3.4.2](https://archive.apache.org/dist/spark/spark-3.4.2/)
+> :warning: Please note this job has been tested with spark version [3.5.1](https://archive.apache.org/dist/spark/spark-3.5.1/)
 
 ## Install as a Container
 - Get the latest image that includes all dependencies from [DockerHub](https://hub.docker.com/r/datastax/cassandra-data-migrator)

@@ -18,10 +18,10 @@ Migrate and Validate Tables between Origin and Target Cassandra Clusters.
 
 ### Prerequisite
 - Install **Java11** (minimum) as Spark binaries are compiled with it.
-- Install Spark version [`3.4.2`](https://archive.apache.org/dist/spark/spark-3.4.2/spark-3.4.2-bin-hadoop3-scala2.13.tgz) on a single VM (no cluster necessary) where you want to run this job. Spark can be installed by running the following: -
+- Install Spark version [`3.5.1`](https://archive.apache.org/dist/spark/spark-3.5.1/spark-3.5.1-bin-hadoop3-scala2.13.tgz) on a single VM (no cluster necessary) where you want to run this job. Spark can be installed by running the following: -
 ```
-wget https://archive.apache.org/dist/spark/spark-3.4.2/spark-3.4.2-bin-hadoop3-scala2.13.tgz
-tar -xvzf spark-3.4.2-bin-hadoop3-scala2.13.tgz
+wget https://archive.apache.org/dist/spark/spark-3.5.1/spark-3.5.1-bin-hadoop3-scala2.13.tgz
+tar -xvzf spark-3.5.1-bin-hadoop3-scala2.13.tgz
 ```
 
 > :warning: If the above Spark and Scala version is not properly installed, you'll then see a similar exception like below when running the CDM jobs,
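The README warning refers to a mismatch between the installed Spark build and the one CDM was tested with. One quick check is parsing the version out of `spark-submit --version` output; in the sketch below a sample string stands in for that output, since the exact banner format is an assumption here:

```shell
# Sketch: extract the version number from spark-submit's banner.
# "sample" stands in for: spark-submit --version 2>&1
sample="Welcome to Spark version 3.5.1"
ver=$(printf '%s\n' "$sample" | sed -n 's/.*version \([0-9][0-9.]*\).*/\1/p')
[ "$ver" = "3.5.1" ] && echo "Spark version matches the tested 3.5.1"
```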

RELEASE.md

Lines changed: 3 additions & 0 deletions

@@ -1,4 +1,7 @@
 # Release Notes
+## [4.1.13] - 2024-02-27
+- Upgraded to use Spark `3.5.1`.
+
 ## [4.1.12] - 2024-01-22
 - Upgraded to use Spark `3.4.2`.
 - Added Java `11` as the minimally required pre-requisite to run CDM jobs.

pom.xml

Lines changed: 4 additions & 4 deletions

@@ -10,9 +10,9 @@
 <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
 <scala.version>2.13.12</scala.version>
 <scala.main.version>2.13</scala.main.version>
-<spark.version>3.4.1</spark.version>
-<connector.version>3.4.1</connector.version>
-<cassandra.version>5.0-alpha1</cassandra.version>
+<spark.version>3.5.1</spark.version>
+<connector.version>3.5.0</connector.version>
+<cassandra.version>5.0-beta1</cassandra.version>
 <junit.version>5.9.1</junit.version>
 <mockito.version>4.11.0</mockito.version>
 <java-driver.version>4.17.0</java-driver.version>

@@ -198,7 +198,7 @@
 <plugin>
 <groupId>net.alchim31.maven</groupId>
 <artifactId>scala-maven-plugin</artifactId>
-<version>4.8.0</version>
+<version>4.8.1</version>
 <executions>
 <execution>
 <phase>process-sources</phase>
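Note that the connector bump tracks the Spark bump: the Spark Cassandra Connector's major.minor series (3.5.x here) is generally expected to match Spark's. A small shell sketch of that sanity check; the pairing rule is a general expectation, not something this diff enforces:

```shell
# Sketch: verify spark.version and connector.version share a major.minor series.
spark_version=3.5.1
connector_version=3.5.0

# ${var%.*} strips the last dot-separated component, leaving "3.5"
if [ "${spark_version%.*}" = "${connector_version%.*}" ]; then
  echo "connector ${connector_version} matches the Spark ${spark_version%.*}.x series"
fi
```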

rat-excludes.txt

Lines changed: 4 additions & 0 deletions

@@ -6,6 +6,7 @@
 .github/workflows/maven.yml
 .github/workflows/snyk-cli-scan.yml
 .github/workflows/snyk-pr-cleanup.yml
+.github/workflows/dependabot.yml
 README.md
 rat-excludes.txt
 pom.xml

@@ -19,7 +20,9 @@ Dockerfile
 .snyk
 .snyk.ignore.example
 PERF/*
+PERF/*/*/output/*
 SIT/*
+SIT/*/*/output/*
 scripts/*
 test-backup/feature/*
 src/resources/partitions.csv

@@ -81,6 +84,7 @@ SIT/smoke/04_counters/cdm.validateData.assert
 SIT/smoke/04_counters/cdm.fixForce.assert
 SIT/smoke/05_reserved_keyword/cdm.txt
 SIT/smoke/05_reserved_keyword/expected.out
+SIT/smoke_inflight/06_vector/cdm.sh
 PERF/logs/scenario_20230523_162859_122.log
 PERF/logs/scenario_20230523_162126_056.log
 PERF/logs/scenario_20230523_162204_904.log

src/resources/migrate_data.sh

Lines changed: 1 addition & 1 deletion

@@ -35,7 +35,7 @@
 ###########################################################################################################################
 
 # Path to spark-submit
-SPARK_SUBMIT=/home/ubuntu/spark-3.4.2-bin-hadoop3-scala2.13/bin/spark-submit
+SPARK_SUBMIT=/home/ubuntu/spark-3.5.1-bin-hadoop3-scala2.13/bin/spark-submit
 
 # Path to spark configuration for the table
 PROPS_FILE=/home/ubuntu/sparkConf.properties
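For context, `SPARK_SUBMIT` and `PROPS_FILE` feed the job launch further down this script. A hypothetical assembly of that command from the two variables; the CDM job class and jar file name are illustrative assumptions, not taken from this diff:

```shell
# Hypothetical: compose the launch command from the script's variables.
# Class name and jar name below are placeholders for illustration.
SPARK_SUBMIT=/home/ubuntu/spark-3.5.1-bin-hadoop3-scala2.13/bin/spark-submit
PROPS_FILE=/home/ubuntu/sparkConf.properties
CMD="$SPARK_SUBMIT --properties-file $PROPS_FILE --master local[*] --class com.datastax.cdm.job.Migrate cassandra-data-migrator.jar"
echo "$CMD"
```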
