
Commit 39ab797

Merge pull request #88 from datastax/feature/blank-timestamp-in-key
Handle blank timestamp values in primary-key columns gracefully
2 parents: c7e6f3c + 620a193
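
The change is driven by a single optional property read in AbstractJobSession below. As an illustrative usage sketch (the property name and --conf form come from the error message added in this commit; the value 0, i.e. epoch 1970-01-01T00:00:00Z, is an arbitrary example choice):

    --conf spark.target.replace.blankTimestampKeyUsingEpoch="0"

When the property is unset, rows whose timestamp primary-key column is blank are logged and skipped; when it is set, the blank is replaced with Instant.ofEpochSecond(<value>).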

File tree: 6 files changed (+109, -40 lines)

  pom.xml
  src/main/java/datastax/astra/migrate/AbstractJobSession.java
  src/main/java/datastax/astra/migrate/BaseJobSession.java
  src/main/java/datastax/astra/migrate/CopyJobSession.java
  src/main/java/datastax/astra/migrate/DiffJobSession.java
  src/resources/sparkConf.properties

pom.xml (1 addition, 1 deletion)

@@ -8,7 +8,7 @@
 
     <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
-       <revision>3.1.0</revision>
+       <revision>3.2.1</revision>
        <scala.version>2.12.17</scala.version>
        <scala.main.version>2.12</scala.main.version>
        <spark.version>3.3.1</spark.version>

src/main/java/datastax/astra/migrate/AbstractJobSession.java (68 additions, 25 deletions)

@@ -12,7 +12,8 @@
 
 import java.time.Duration;
 import java.time.Instant;
-import java.util.Map;
+import java.util.List;
+import java.util.Optional;
 import java.util.stream.IntStream;
 
 public class AbstractJobSession extends BaseJobSession {
@@ -25,11 +26,11 @@ protected AbstractJobSession(CqlSession sourceSession, CqlSession astraSession,
 
     protected AbstractJobSession(CqlSession sourceSession, CqlSession astraSession, SparkConf sc, boolean isJobMigrateRowsFromFile) {
         super(sc);
-
+
        if (sourceSession == null) {
            return;
        }
-
+
        this.sourceSession = sourceSession;
        this.astraSession = astraSession;
 
@@ -105,14 +106,14 @@ protected AbstractJobSession(CqlSession sourceSession, CqlSession astraSession,
        }
 
        String selectCols = Util.getSparkProp(sc, "spark.query.origin");
-       String partionKey = Util.getSparkProp(sc, "spark.query.origin.partitionKey");
+       String partitionKey = Util.getSparkProp(sc, "spark.query.origin.partitionKey");
        String sourceSelectCondition = Util.getSparkPropOrEmpty(sc, "spark.query.condition");
        if (!sourceSelectCondition.isEmpty() && !sourceSelectCondition.trim().toUpperCase().startsWith("AND")) {
            sourceSelectCondition = " AND " + sourceSelectCondition;
        }
 
        final StringBuilder selectTTLWriteTimeCols = new StringBuilder();
-       String[] allCols = selectCols.split(",");
+       allCols = selectCols.split(",");
        ttlCols.forEach(col -> {
            selectTTLWriteTimeCols.append(",ttl(" + allCols[col] + ")");
        });
@@ -138,8 +139,9 @@ protected AbstractJobSession(CqlSession sourceSession, CqlSession astraSession,
 
        String fullSelectQuery;
        if (!isJobMigrateRowsFromFile) {
-           fullSelectQuery = "select " + selectCols + selectTTLWriteTimeCols + " from " + sourceKeyspaceTable + " where token(" + partionKey.trim()
-                   + ") >= ? and token(" + partionKey.trim() + ") <= ? " + sourceSelectCondition + " ALLOW FILTERING";
+           fullSelectQuery = "select " + selectCols + selectTTLWriteTimeCols + " from " + sourceKeyspaceTable +
+                   " where token(" + partitionKey.trim() + ") >= ? and token(" + partitionKey.trim() + ") <= ? " +
+                   sourceSelectCondition + " ALLOW FILTERING";
        } else {
            fullSelectQuery = "select " + selectCols + selectTTLWriteTimeCols + " from " + sourceKeyspaceTable + " where " + insertBinds;
        }
@@ -181,6 +183,12 @@ protected AbstractJobSession(CqlSession sourceSession, CqlSession astraSession,
            }
            astraInsertStatement = astraSession.prepare(fullInsertQuery);
        }
+
+       // Handle rows with blank values for 'timestamp' data-type in primary-key fields
+       tsReplaceValStr = Util.getSparkPropOr(sc, "spark.target.replace.blankTimestampKeyUsingEpoch", "");
+       if (!tsReplaceValStr.isEmpty()) {
+           tsReplaceVal = Long.parseLong(tsReplaceValStr);
+       }
    }
 
    public BoundStatement bindInsert(PreparedStatement insertStatement, Row sourceRow, Row astraRow) {
@@ -199,21 +207,8 @@ public BoundStatement bindInsert(PreparedStatement insertStatement, Row sourceRo
        } else {
            int index = 0;
            for (index = 0; index < selectColTypes.size(); index++) {
-               MigrateDataType dataTypeObj = selectColTypes.get(index);
-               Class dataType = dataTypeObj.typeClass;
-
-               try {
-                   Object colData = getData(dataTypeObj, index, sourceRow);
-                   if (index < idColTypes.size() && colData == null && dataType == String.class) {
-                       colData = "";
-                   }
-                   boundInsertStatement = boundInsertStatement.set(index, colData, dataType);
-               } catch (NullPointerException e) {
-                   // ignore the exception for map values being null
-                   if (dataType != Map.class) {
-                       throw e;
-                   }
-               }
+               boundInsertStatement = getBoundStatement(sourceRow, boundInsertStatement, index, selectColTypes);
+               if (boundInsertStatement == null) return null;
            }
 
            if (!ttlCols.isEmpty()) {
@@ -246,12 +241,60 @@ public long getLargestWriteTimeStamp(Row sourceRow) {
    public BoundStatement selectFromAstra(PreparedStatement selectStatement, Row sourceRow) {
        BoundStatement boundSelectStatement = selectStatement.bind().setConsistencyLevel(readConsistencyLevel);
        for (int index = 0; index < idColTypes.size(); index++) {
-           MigrateDataType dataType = idColTypes.get(index);
-           boundSelectStatement = boundSelectStatement.set(index, getData(dataType, index, sourceRow),
-                   dataType.typeClass);
+           boundSelectStatement = getBoundStatement(sourceRow, boundSelectStatement, index, idColTypes);
+           if (boundSelectStatement == null) return null;
        }
 
        return boundSelectStatement;
    }
 
+   private BoundStatement getBoundStatement(Row sourceRow, BoundStatement boundSelectStatement, int index,
+                                            List<MigrateDataType> cols) {
+       MigrateDataType dataTypeObj = cols.get(index);
+       Object colData = getData(dataTypeObj, index, sourceRow);
+
+       // Handle rows with blank values in primary-key fields
+       if (index < idColTypes.size()) {
+           Optional<Object> optionalVal = handleBlankInPrimaryKey(index, colData, dataTypeObj.typeClass, sourceRow);
+           if (!optionalVal.isPresent()) {
+               return null;
+           }
+           colData = optionalVal.get();
+       }
+       boundSelectStatement = boundSelectStatement.set(index, colData, dataTypeObj.typeClass);
+       return boundSelectStatement;
+   }
+
+   protected Optional<Object> handleBlankInPrimaryKey(int index, Object colData, Class dataType, Row sourceRow) {
+       return handleBlankInPrimaryKey(index, colData, dataType, sourceRow, true);
+   }
+
+   protected Optional<Object> handleBlankInPrimaryKey(int index, Object colData, Class dataType, Row sourceRow, boolean logWarn) {
+       // Handle rows with blank values for 'String' data-type in primary-key fields
+       if (index < idColTypes.size() && colData == null && dataType == String.class) {
+           if (logWarn) {
+               logger.warn("For row with Key: {}, found String primary-key column {} with blank value",
+                       getKey(sourceRow), allCols[index]);
+           }
+           return Optional.of("");
+       }
+
+       // Handle rows with blank values for 'timestamp' data-type in primary-key fields
+       if (index < idColTypes.size() && colData == null && dataType == Instant.class) {
+           if (tsReplaceValStr.isEmpty()) {
+               logger.error("Skipping row with Key: {} as Timestamp primary-key column {} has invalid blank value. " +
+                       "Alternatively rerun the job with --conf spark.target.replace.blankTimestampKeyUsingEpoch=\"<fixed-epoch-value>\" " +
+                       "option to replace the blanks with a fixed timestamp value", getKey(sourceRow), allCols[index]);
+               return Optional.empty();
+           }
+           if (logWarn) {
+               logger.warn("For row with Key: {}, found Timestamp primary-key column {} with invalid blank value. " +
+                       "Using value {} instead", getKey(sourceRow), allCols[index], Instant.ofEpochSecond(tsReplaceVal));
+           }
+           return Optional.of(Instant.ofEpochSecond(tsReplaceVal));
+       }
+
+       return Optional.of(colData);
+   }
+
 }
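
To make the control flow above easier to follow, here is a minimal, self-contained sketch of the substitution rules that handleBlankInPrimaryKey implements. The class and method names below are illustrative, not part of the tool; the real method also checks the column index against idColTypes and logs via the job's logger.

import java.time.Instant;
import java.util.Optional;

// Illustrative stand-in for the blank-primary-key rules added in this commit.
public class BlankKeySketch {

    // Stand-in for tsReplaceValStr: an empty string means
    // spark.target.replace.blankTimestampKeyUsingEpoch was not set.
    private final String tsReplaceValStr;

    BlankKeySketch(String tsReplaceValStr) {
        this.tsReplaceValStr = tsReplaceValStr;
    }

    // An empty Optional signals "skip this row"; otherwise the Optional
    // holds the (possibly substituted) primary-key value to bind.
    Optional<Object> handleBlank(Object colData, Class<?> dataType) {
        if (colData == null && dataType == String.class) {
            return Optional.of("");  // blank String key becomes ""
        }
        if (colData == null && dataType == Instant.class) {
            if (tsReplaceValStr.isEmpty()) {
                return Optional.empty();  // no replacement configured: skip the row
            }
            // blank timestamp key becomes the configured fixed epoch-second value
            return Optional.of(Instant.ofEpochSecond(Long.parseLong(tsReplaceValStr)));
        }
        // value present: keep it (the real code uses Optional.of here)
        return Optional.ofNullable(colData);
    }

    public static void main(String[] args) {
        BlankKeySketch unset = new BlankKeySketch("");
        BlankKeySketch fixed = new BlankKeySketch("0");

        System.out.println(unset.handleBlank(null, Instant.class));  // Optional.empty -> row skipped
        System.out.println(fixed.handleBlank(null, Instant.class));  // Optional[1970-01-01T00:00:00Z]
        System.out.println(fixed.handleBlank(null, String.class));   // Optional holding ""
        System.out.println(fixed.handleBlank("abc", String.class));  // Optional[abc]
    }
}

The empty-Optional convention is what lets bindInsert and selectFromAstra return null, which the callers in CopyJobSession and DiffJobSession translate into skip counts instead of failed writes.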

src/main/java/datastax/astra/migrate/BaseJobSession.java (4 additions, 0 deletions)

@@ -59,6 +59,10 @@ public abstract class BaseJobSession {
    protected Integer filterColIndex;
    protected String filterColValue;
 
+   protected String[] allCols;
+   protected String tsReplaceValStr;
+   protected long tsReplaceVal;
+
    protected BaseJobSession(SparkConf sc) {
        readConsistencyLevel = Util.mapToConsistencyLevel(Util.getSparkPropOrEmpty(sc, "spark.consistency.read"));
        writeConsistencyLevel = Util.mapToConsistencyLevel(Util.getSparkPropOrEmpty(sc, "spark.consistency.write"));

src/main/java/datastax/astra/migrate/CopyJobSession.java (12 additions, 3 deletions)

@@ -95,8 +95,12 @@ public void getDataAndInsert(BigInteger min, BigInteger max) {
                astraRow = astraReadResultSet.one();
            }
 
-           CompletionStage<AsyncResultSet> astraWriteResultSet = astraSession
-                   .executeAsync(bindInsert(astraInsertStatement, sourceRow, astraRow));
+           BoundStatement bInsert = bindInsert(astraInsertStatement, sourceRow, astraRow);
+           if (null == bInsert) {
+               skipCnt++;
+               continue;
+           }
+           CompletionStage<AsyncResultSet> astraWriteResultSet = astraSession.executeAsync(bInsert);
            writeResults.add(astraWriteResultSet);
            if (writeResults.size() > fetchSizeInRows) {
                writeCnt += iterateAndClearWriteResults(writeResults, 1);
@@ -124,7 +128,12 @@ public void getDataAndInsert(BigInteger min, BigInteger max) {
            }
 
            writeLimiter.acquire(1);
-           batchStatement = batchStatement.add(bindInsert(astraInsertStatement, sourceRow, null));
+           BoundStatement bInsert = bindInsert(astraInsertStatement, sourceRow, null);
+           if (null == bInsert) {
+               skipCnt++;
+               continue;
+           }
+           batchStatement = batchStatement.add(bInsert);
 
            // if batch threshold is met, send the writes and clear the batch
            if (batchStatement.size() >= batchSize) {

src/main/java/datastax/astra/migrate/DiffJobSession.java (23 additions, 10 deletions)

@@ -2,6 +2,7 @@
 
 import com.datastax.oss.driver.api.core.CqlSession;
 import com.datastax.oss.driver.api.core.cql.AsyncResultSet;
+import com.datastax.oss.driver.api.core.cql.BoundStatement;
 import com.datastax.oss.driver.api.core.cql.ResultSet;
 import com.datastax.oss.driver.api.core.cql.Row;
 import com.datastax.oss.driver.api.core.data.UdtValue;
@@ -12,6 +13,7 @@
 import java.math.BigInteger;
 import java.util.HashMap;
 import java.util.Map;
+import java.util.Optional;
 import java.util.concurrent.CompletionStage;
 import java.util.concurrent.atomic.AtomicLong;
 import java.util.stream.IntStream;
@@ -74,11 +76,15 @@ public void getDataAndDiff(BigInteger min, BigInteger max) {
                    printCounts(false);
                }
 
-               CompletionStage<AsyncResultSet> targetRowFuture = astraSession
-                       .executeAsync(selectFromAstra(astraSelectStatement, srcRow));
-               srcToTargetRowMap.put(srcRow, targetRowFuture);
-               if (srcToTargetRowMap.size() > fetchSizeInRows) {
-                   diffAndClear(srcToTargetRowMap);
+               BoundStatement bSelect = selectFromAstra(astraSelectStatement, srcRow);
+               if (null == bSelect) {
+                   skippedCounter.incrementAndGet();
+               } else {
+                   CompletionStage<AsyncResultSet> targetRowFuture = astraSession.executeAsync(bSelect);
+                   srcToTargetRowMap.put(srcRow, targetRowFuture);
+                   if (srcToTargetRowMap.size() > fetchSizeInRows) {
+                       diffAndClear(srcToTargetRowMap);
+                   }
                }
            } else {
                readCounter.incrementAndGet();
@@ -165,13 +171,20 @@ private void diff(Row sourceRow, Row astraRow) {
    private String isDifferent(Row sourceRow, Row astraRow) {
        StringBuffer diffData = new StringBuffer();
        IntStream.range(0, selectColTypes.size()).parallel().forEach(index -> {
-           MigrateDataType dataType = selectColTypes.get(index);
-           Object source = getData(dataType, index, sourceRow);
-           Object astra = getData(dataType, index, astraRow);
+           MigrateDataType dataTypeObj = selectColTypes.get(index);
+           Object source = getData(dataTypeObj, index, sourceRow);
+           if (index < idColTypes.size()) {
+               Optional<Object> optionalVal = handleBlankInPrimaryKey(index, source, dataTypeObj.typeClass, sourceRow, false);
+               if (optionalVal.isPresent()) {
+                   source = optionalVal.get();
+               }
+           }
+
+           Object astra = getData(dataTypeObj, index, astraRow);
 
-           boolean isDiff = dataType.diff(source, astra);
+           boolean isDiff = dataTypeObj.diff(source, astra);
            if (isDiff) {
-               if (dataType.typeClass.equals(UdtValue.class)) {
+               if (dataTypeObj.typeClass.equals(UdtValue.class)) {
                    String sourceUdtContent = ((UdtValue) source).getFormattedContents();
                    String astraUdtContent = ((UdtValue) astra).getFormattedContents();
                    if (!sourceUdtContent.equals(astraUdtContent)) {

src/resources/sparkConf.properties (1 addition, 1 deletion)

@@ -64,7 +64,7 @@ spark.query.types 9,1,4,3
 #############################################################################################################
 
 # ENABLE ONLY IF COLUMN NAMES ON TARGET IS DIFFERENT FROM ORIGIN (SCHEMA & DATA-TYPES MUST BE SAME)
-#spark.query.target partition-key,clustering-key,order-date,amount
+#spark.query.target comma-separated-partition-key,comma-separated-clustering-key,comma-separated-other-columns
 
 # The tool adds TTL & Writetime at row-level (not field-level).
 # The largest TTL & Writetime values are used if multiple indexes are listed (comma separated)
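
For illustration, a hypothetical value matching the format the updated comment describes, for a target table with two partition-key columns, one clustering column, and two other columns (all column names invented for this example; enable it only if the target column names differ from the origin):

    spark.query.target store_id,region,order_date,amount,currency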
