
Commit 7407828
initial commit
1 parent 8140df0 commit 7407828

File tree: 5 files changed (+274, -26 lines)

src/main/java/datastax/astra/migrate/AbstractJobSession.java

Lines changed: 0 additions & 26 deletions

@@ -154,15 +154,6 @@ protected AbstractJobSession(CqlSession sourceSession, CqlSession astraSession,
         }
     }
 
-    public List<MigrateDataType> getTypes(String types) {
-        List<MigrateDataType> dataTypes = new ArrayList<MigrateDataType>();
-        for (String type : types.split(",")) {
-            dataTypes.add(new MigrateDataType(type));
-        }
-
-        return dataTypes;
-    }
-
     public int getLargestTTL(Row sourceRow) {
         return IntStream.range(0, ttlCols.size())
                 .map(i -> sourceRow.getInt(selectColTypes.size() + i)).max().getAsInt();

@@ -184,21 +175,4 @@ public BoundStatement selectFromAstra(PreparedStatement selectStatement, Row sou
         return boundSelectStatement;
     }
 
-    public Object getData(MigrateDataType dataType, int index, Row sourceRow) {
-        if (dataType.typeClass == Map.class) {
-            return sourceRow.getMap(index, dataType.subTypes.get(0), dataType.subTypes.get(1));
-        } else if (dataType.typeClass == List.class) {
-            return sourceRow.getList(index, dataType.subTypes.get(0));
-        } else if (dataType.typeClass == Set.class) {
-            return sourceRow.getSet(index, dataType.subTypes.get(0));
-        } else if (isCounterTable && dataType.typeClass == Long.class) {
-            Object data = sourceRow.get(index, dataType.typeClass);
-            if (data == null) {
-                return new Long(0);
-            }
-        }
-
-        return sourceRow.get(index, dataType.typeClass);
-    }
-
 }

src/main/java/datastax/astra/migrate/BaseJobSession.java

Lines changed: 28 additions & 0 deletions

@@ -2,10 +2,13 @@
 
 import com.datastax.oss.driver.api.core.CqlSession;
 import com.datastax.oss.driver.api.core.cql.PreparedStatement;
+import com.datastax.oss.driver.api.core.cql.Row;
 import com.datastax.oss.driver.shaded.guava.common.util.concurrent.RateLimiter;
 
 import java.util.ArrayList;
 import java.util.List;
+import java.util.Map;
+import java.util.Set;
 
 public abstract class BaseJobSession {
 
@@ -46,4 +49,29 @@ public abstract class BaseJobSession {
 
     protected Boolean hasRandomPartitioner;
 
+    public List<MigrateDataType> getTypes(String types) {
+        List<MigrateDataType> dataTypes = new ArrayList<MigrateDataType>();
+        for (String type : types.split(",")) {
+            dataTypes.add(new MigrateDataType(type));
+        }
+
+        return dataTypes;
+    }
+
+    public Object getData(MigrateDataType dataType, int index, Row sourceRow) {
+        if (dataType.typeClass == Map.class) {
+            return sourceRow.getMap(index, dataType.subTypes.get(0), dataType.subTypes.get(1));
+        } else if (dataType.typeClass == List.class) {
+            return sourceRow.getList(index, dataType.subTypes.get(0));
+        } else if (dataType.typeClass == Set.class) {
+            return sourceRow.getSet(index, dataType.subTypes.get(0));
+        } else if (isCounterTable && dataType.typeClass == Long.class) {
+            Object data = sourceRow.get(index, dataType.typeClass);
+            if (data == null) {
+                return new Long(0);
+            }
+        }
+
+        return sourceRow.get(index, dataType.typeClass);
+    }
 }
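
With the two helpers relocated into the shared base class, any job session that extends BaseJobSession can reuse them. The sketch below is a hypothetical subclass, not part of this commit: the ExampleJobSession name and the hard-coded "9,1" type-code string are illustrative assumptions, shown only to demonstrate how getTypes() and getData() are now called.

package datastax.astra.migrate;

import com.datastax.oss.driver.api.core.cql.Row;
import java.util.List;

// Hypothetical subclass, for illustration only: shows how the helpers moved into
// BaseJobSession can be reused by any concrete job session.
public class ExampleJobSession extends BaseJobSession {

    // Parse a comma-separated list of type codes (same format as the *.cols.types
    // properties) and read each column of a source row with its declared type.
    public void logRow(Row sourceRow) {
        List<MigrateDataType> types = getTypes("9,1"); // hypothetical type codes
        for (int index = 0; index < types.size(); index++) {
            Object colData = getData(types.get(index), index, sourceRow);
            System.out.println("Column " + index + ": " + colData);
        }
    }
}
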
Lines changed: 157 additions & 0 deletions

@@ -0,0 +1,157 @@
+package datastax.astra.migrate;
+
+import com.datastax.oss.driver.api.core.CqlSession;
+import com.datastax.oss.driver.api.core.cql.*;
+import com.datastax.oss.driver.shaded.guava.common.util.concurrent.RateLimiter;
+import org.apache.commons.lang.SerializationUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.spark.SparkConf;
+
+import java.io.Serializable;
+import java.math.BigInteger;
+import java.util.*;
+import java.util.concurrent.CompletionStage;
+import java.util.concurrent.atomic.AtomicLong;
+
+public class OriginCountJobSession extends BaseJobSession {
+    public Logger logger = LoggerFactory.getLogger(this.getClass().getName());
+    private static OriginCountJobSession originCountJobSession;
+    protected AtomicLong readCounter = new AtomicLong(0);
+    protected List<Integer> updateSelectMapping = new ArrayList<Integer>();
+    protected Boolean checkTableforColSize;
+    protected String checkTableforselectCols;
+    protected String filterColName;
+    protected String filterColType;
+    protected Integer filterColIndex;
+    protected List<MigrateDataType> checkTableforColSizeTypes = new ArrayList<MigrateDataType>();
+    public static OriginCountJobSession getInstance(CqlSession sourceSession, SparkConf sparkConf) {
+        if (originCountJobSession == null) {
+            synchronized (OriginCountJobSession.class) {
+                if (originCountJobSession == null) {
+                    originCountJobSession = new OriginCountJobSession(sourceSession, sparkConf);
+                }
+            }
+        }
+
+        return originCountJobSession;
+    }
+
+    protected OriginCountJobSession(CqlSession sourceSession, SparkConf sparkConf) {
+        this.sourceSession = sourceSession;
+        batchSize = new Integer(sparkConf.get("spark.batchSize", "1"));
+        printStatsAfter = new Integer(sparkConf.get("spark.printStatsAfter", "100000"));
+        if (printStatsAfter < 1) {
+            printStatsAfter = 100000;
+        }
+
+        readLimiter = RateLimiter.create(new Integer(sparkConf.get("spark.readRateLimit", "20000")));
+        sourceKeyspaceTable = sparkConf.get("spark.source.keyspaceTable");
+
+        hasRandomPartitioner = Boolean.parseBoolean(sparkConf.get("spark.source.hasRandomPartitioner", "false"));
+        isCounterTable = Boolean.parseBoolean(sparkConf.get("spark.source.counterTable", "false"));
+
+        checkTableforColSize = Boolean.parseBoolean(sparkConf.get("spark.source.checkTableforColSize", "false"));
+        checkTableforselectCols = sparkConf.get("spark.source.checkTableforColSize.cols");
+        checkTableforColSizeTypes = getTypes(sparkConf.get("spark.source.checkTableforColSize.cols.types"));
+        filterColName = sparkConf.get("spark.source.FilterColumn");
+        filterColType = sparkConf.get("spark.source.FilterColumnType");
+        filterColIndex = Integer.parseInt(sparkConf.get("spark.source.FilterColumnIndex", "0"));
+
+        String partionKey = sparkConf.get("spark.query.cols.partitionKey");
+        idColTypes = getTypes(sparkConf.get("spark.query.cols.id.types"));
+
+        String selectCols = sparkConf.get("spark.query.cols.select");
+        String updateSelectMappingStr = sparkConf.get("spark.source.counterTable.update.select.index", "0");
+        for (String updateSelectIndex : updateSelectMappingStr.split(",")) {
+            updateSelectMapping.add(Integer.parseInt(updateSelectIndex));
+        }
+        String sourceSelectCondition = sparkConf.get("spark.query.cols.select.condition", "");
+        sourceSelectStatement = sourceSession.prepare(
+                "select " + selectCols + " from " + sourceKeyspaceTable + " where token(" + partionKey.trim()
+                        + ") >= ? and token(" + partionKey.trim() + ") <= ? " + sourceSelectCondition + " ALLOW FILTERING");
+
+    }
+
+    public void getData(BigInteger min, BigInteger max) {
+        logger.info("TreadID: " + Thread.currentThread().getId() + " Processing min: " + min + " max:" + max);
+        int maxAttempts = maxRetries;
+        for (int retryCount = 1; retryCount <= maxAttempts; retryCount++) {
+
+            try {
+                ResultSet resultSet = sourceSession.execute(sourceSelectStatement.bind(hasRandomPartitioner ? min : min.longValueExact(), hasRandomPartitioner ? max : max.longValueExact()));
+                Collection<CompletionStage<AsyncResultSet>> writeResults = new ArrayList<CompletionStage<AsyncResultSet>>();
+
+                // cannot do batching if the writeFilter is greater than 0 or
+                // maxWriteTimeStampFilter is less than max long
+                // do not batch for counters as it adds latency & increases chance of discrepancy
+                if (batchSize == 1 || writeTimeStampFilter || isCounterTable) {
+                    for (Row sourceRow : resultSet) {
+                        readLimiter.acquire(1);
+
+                        if (checkTableforColSize) {
+                            int rowColcnt = GetRowColumnLength(sourceRow, filterColType, filterColIndex);
+                            String result = "";
+                            if (rowColcnt > 1024 * 1024 * 10) {
+                                for (int index = 0; index < checkTableforColSizeTypes.size(); index++) {
+                                    MigrateDataType dataType = checkTableforColSizeTypes.get(index);
+                                    Object colData = getData(dataType, index, sourceRow);
+                                    String[] colName = checkTableforselectCols.split(",");
+                                    result = result + " - " + colName[index] + " : " + colData;
+                                }
+                                logger.error("ThreadID: " + Thread.currentThread().getId() + result + " - " + filterColName + " length: " + rowColcnt);
+                                continue;
+                            }
+                        }
+                    }
+
+                } else {
+                    BatchStatement batchStatement = BatchStatement.newInstance(BatchType.UNLOGGED);
+                    for (Row sourceRow : resultSet) {
+                        readLimiter.acquire(1);
+                        writeLimiter.acquire(1);
+
+                        if (checkTableforColSize) {
+                            int rowColcnt = GetRowColumnLength(sourceRow, filterColType, filterColIndex);
+                            String result = "";
+                            if (rowColcnt > 1024 * 1024 * 10) {
+                                for (int index = 0; index < checkTableforColSizeTypes.size(); index++) {
+                                    MigrateDataType dataType = checkTableforColSizeTypes.get(index);
+                                    Object colData = getData(dataType, index, sourceRow);
+                                    String[] colName = checkTableforselectCols.split(",");
+                                    result = result + " - " + colName[index] + " : " + colData;
+                                }
+                                logger.error("ThreadID: " + Thread.currentThread().getId() + result + " - " + filterColName + " length: " + rowColcnt);
+                                continue;
+                            }
+                        }
+
+                        if (readCounter.incrementAndGet() % 1000 == 0) {
+                            logger.info("TreadID: " + Thread.currentThread().getId() + " Read Record Count: " + readCounter.get());
+                        }
+
+                    }
+                }
+
+
+                logger.info("TreadID: " + Thread.currentThread().getId() + " Final Read Record Count: " + readCounter.get());
+                retryCount = maxAttempts;
+            } catch (Exception e) {
+                logger.error("Error occurred retry#: " + retryCount, e);
+                logger.error("Error with PartitionRange -- TreadID: " + Thread.currentThread().getId() + " Processing min: " + min + " max:" + max + " -- Retry# " + retryCount);
+            }
+        }
+
+    }
+
+    private int GetRowColumnLength(Row sourceRow, String filterColType, Integer filterColIndex) {
+        int i = 0;
+        Object colData = getData(new MigrateDataType(filterColType), filterColIndex, sourceRow);
+        byte[] colBytes = SerializationUtils.serialize((Serializable) colData);
+        i = colBytes.length;
+        if (i > 1024 * 1024 * 10)
+            return i;
+        return i;
+    }
+
+}
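
GetRowColumnLength() above sizes a column by Java-serializing its value with commons-lang SerializationUtils and taking the byte-array length, and its callers flag any row whose filter column exceeds 1024 * 1024 * 10 bytes (10,485,760 bytes, i.e. 10 MB). The standalone sketch below mirrors that check; the ColumnSizeCheck class name and the MAX_COL_BYTES constant are illustrative and not part of this commit.

package datastax.astra.migrate;

import org.apache.commons.lang.SerializationUtils;
import java.io.Serializable;

// Illustrative helper only: mirrors the serialize-and-measure logic of
// GetRowColumnLength() and the 10 MB threshold used by its callers.
public final class ColumnSizeCheck {

    static final int MAX_COL_BYTES = 1024 * 1024 * 10; // 10 MB

    // Serialized size of a column value, in bytes.
    public static int sizeOf(Object colData) {
        byte[] colBytes = SerializationUtils.serialize((Serializable) colData);
        return colBytes.length;
    }

    // True when the value exceeds the threshold that triggers the error log above.
    public static boolean isOversized(Object colData) {
        return sizeOf(colData) > MAX_COL_BYTES;
    }
}
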
Lines changed: 83 additions & 0 deletions

@@ -0,0 +1,83 @@
+package datastax.astra.migrate
+
+import com.datastax.spark.connector.cql.CassandraConnector
+import org.slf4j.LoggerFactory
+
+import scala.collection.JavaConversions._
+
+object OriginData extends BaseJob {
+
+  val logger = LoggerFactory.getLogger(this.getClass.getName)
+  logger.info("Started Migration App")
+  var sourceConnection = getConnection(true, sourceIsAstra, sourceScbPath, sourceHost, sourceUsername, sourcePassword, sourceReadConsistencyLevel,
+    sourceTrustStorePath, sourceTrustStorePassword, sourceTrustStoreType, sourceKeyStorePath, sourceKeyStorePassword, sourceEnabledAlgorithms);
+  analyzeSourceTable(sourceConnection)
+  exitSpark
+
+
+  private def getConnection(isSource: Boolean, isAstra: String, scbPath: String, host: String, username: String, password: String, readConsistencyLevel: String,
+                            trustStorePath: String, trustStorePassword: String, trustStoreType: String,
+                            keyStorePath: String, keyStorePassword: String, enabledAlgorithms: String): CassandraConnector = {
+    var connType: String = "Source"
+
+    if ("true".equals(isAstra)) {
+      abstractLogger.info(connType + ": Connected to Astra!");
+
+      return CassandraConnector(sc
+        .set("spark.cassandra.auth.username", username)
+        .set("spark.cassandra.auth.password", password)
+        .set("spark.cassandra.input.consistency.level", readConsistencyLevel)
+        .set("spark.cassandra.connection.config.cloud.path", scbPath))
+    } else if (null != trustStorePath && !trustStorePath.trim.isEmpty) {
+      abstractLogger.info(connType + ": Connected to Cassandra (or DSE) with SSL!");
+
+      // Use defaults when not provided
+      var enabledAlgorithmsVar = enabledAlgorithms
+      if (enabledAlgorithms == null || enabledAlgorithms.trim.isEmpty) {
+        enabledAlgorithmsVar = "TLS_RSA_WITH_AES_128_CBC_SHA, TLS_RSA_WITH_AES_256_CBC_SHA"
+      }
+
+      return CassandraConnector(sc
+        .set("spark.cassandra.auth.username", username)
+        .set("spark.cassandra.auth.password", password)
+        .set("spark.cassandra.input.consistency.level", readConsistencyLevel)
+        .set("spark.cassandra.connection.host", host)
+        .set("spark.cassandra.connection.ssl.enabled", "true")
+        .set("spark.cassandra.connection.ssl.enabledAlgorithms", enabledAlgorithmsVar)
+        .set("spark.cassandra.connection.ssl.trustStore.password", trustStorePassword)
+        .set("spark.cassandra.connection.ssl.trustStore.path", trustStorePath)
+        .set("spark.cassandra.connection.ssl.keyStore.password", keyStorePassword)
+        .set("spark.cassandra.connection.ssl.keyStore.path", keyStorePath)
+        .set("spark.cassandra.connection.ssl.trustStore.type", trustStoreType)
+        .set("spark.cassandra.connection.ssl.clientAuth.enabled", "true")
+      )
+    } else {
+      abstractLogger.info(connType + ": Connected to Cassandra (or DSE)!");
+
+      return CassandraConnector(sc.set("spark.cassandra.auth.username", username)
+        .set("spark.cassandra.auth.password", password)
+        .set("spark.cassandra.input.consistency.level", readConsistencyLevel)
+        .set("spark.cassandra.connection.host", host))
+    }
+
+  }
+
+  private def analyzeSourceTable(sourceConnection: CassandraConnector) = {
+    val partitions = SplitPartitions.getRandomSubPartitions(splitSize, minPartition, maxPartition, Integer.parseInt(coveragePercent))
+    logger.info("PARAM Calculated -- Total Partitions: " + partitions.size())
+    val parts = sContext.parallelize(partitions.toSeq, partitions.size);
+    logger.info("Spark parallelize created : " + parts.count() + " parts!");
+
+    parts.foreach(part => {
+      sourceConnection.withSessionDo(sourceSession =>
+        OriginCountJobSession.getInstance(sourceSession, sc)
+          .getData(part.getMin, part.getMax))
+    })
+
+  }
+
+}
+
+
+
+
src/resources/sparkConf.properties

Lines changed: 6 additions & 0 deletions

@@ -4,6 +4,12 @@ spark.origin.username some-username
 spark.origin.password some-secret-password
 spark.origin.read.consistency.level LOCAL_QUORUM
 spark.origin.keyspaceTable test.a1
+spark.source.checkTableforColSize false
+spark.source.checkTableforColSize.cols partition-key,clustering-key
+spark.source.checkTableforColSize.cols.types 9,1
+spark.source.FilterColumn test
+spark.source.FilterColumnIndex 2
+spark.source.FilterColumnType 6%16
 
 spark.target.isAstra true
 spark.target.scb file:///aaa/bbb/secure-connect-enterprise.zip
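
These six new keys are the ones the OriginCountJobSession constructor reads via SparkConf (they use the spark.source.* prefix that the constructor looks up, while the pre-existing keys in this file use spark.origin.*). A minimal sketch of that lookup follows; the SparkConfReadExample class and the hard-coded values are illustrative only, not part of the commit.

import org.apache.spark.SparkConf;

// Illustrative only: mirrors how OriginCountJobSession reads the new properties.
public class SparkConfReadExample {
    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf()
                .set("spark.source.checkTableforColSize", "false")
                .set("spark.source.checkTableforColSize.cols", "partition-key,clustering-key")
                .set("spark.source.checkTableforColSize.cols.types", "9,1")
                .set("spark.source.FilterColumn", "test")
                .set("spark.source.FilterColumnIndex", "2")
                .set("spark.source.FilterColumnType", "6%16");

        // Same parsing calls as the constructor shown earlier in this diff.
        boolean checkTableforColSize =
                Boolean.parseBoolean(sparkConf.get("spark.source.checkTableforColSize", "false"));
        int filterColIndex =
                Integer.parseInt(sparkConf.get("spark.source.FilterColumnIndex", "0"));
        String filterColType = sparkConf.get("spark.source.FilterColumnType");

        System.out.println(checkTableforColSize + " / " + filterColIndex + " / " + filterColType);
    }
}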
