
Commit f9c8dd0

Authored by gueniaiaman-db, souravbaner-da, and mohanbaabu1996
0812 release (#1249)
* Initial commit
* adding fix for schemaScrubber and StructToMap (#1232)
* fix for null driver_type_id and node_type_id in jrcp (#1236)
* Modify Cluster_snapshot_bronze column (#1234)
  * Convert all the struct fields inside the 'spec' column of cluster_snapshot_bronze to MapType
  * Dropped the 'spec' column from the snapshot
  * Removed redundant VerifyMinSchema
* Update_AWS_instance_types (#1248)
* Update_gcp_instance_types (#1244)

Co-authored-by: Aman <[email protected]>
Co-authored-by: Sourav Banerjee <[email protected]>
Co-authored-by: Mohan Baabu <[email protected]>
1 parent 7390d4a commit f9c8dd0

6 files changed: +158 −46 lines


build.sbt

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@ name := "overwatch"
 
 organization := "com.databricks.labs"
 
-version := "0.8.1.1"
+version := "0.8.1.2"
 
 scalaVersion := "2.12.12"
 scalacOptions ++= Seq("-Xmax-classfile-name", "78")

src/main/resources/AWS_Instance_Details.csv

Lines changed: 139 additions & 0 deletions
Large diffs are not rendered by default.
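(These 139 added rows presumably extend the AWS instance-details lookup that feeds the driverSpecs/workerSpecs columns — API_Name, Compute_Contract_Price, Hourly_DBUs, vCPUs — consumed in WorkflowsTransforms below.)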

src/main/scala/com/databricks/labs/overwatch/pipeline/BronzeTransforms.scala

Lines changed: 10 additions & 39 deletions
@@ -552,46 +552,17 @@ trait BronzeTransforms extends SparkSessionWrapper {
 
     val rawDF = deriveRawApiResponseDF(spark.read.json(tmpClusterSnapshotSuccessPath))
     if (rawDF.columns.contains("cluster_id")) {
-      val outputDF = SchemaScrubber.scrubSchema(rawDF)
-      val finalDF = outputDF.withColumn("default_tags", SchemaTools.structToMap(outputDF, "default_tags"))
-        .withColumn("custom_tags", SchemaTools.structToMap(outputDF, "custom_tags"))
-        .withColumn("spark_conf", SchemaTools.structToMap(outputDF, "spark_conf"))
-        .withColumn("spark_env_vars", SchemaTools.structToMap(outputDF, "spark_env_vars"))
-        .withColumn(s"aws_attributes", SchemaTools.structToMap(outputDF, s"aws_attributes"))
-        .withColumn(s"azure_attributes", SchemaTools.structToMap(outputDF, s"azure_attributes"))
-        .withColumn(s"gcp_attributes", SchemaTools.structToMap(outputDF, s"gcp_attributes"))
+      val scrubbedDF = SchemaScrubber.scrubSchema(rawDF)
+      val df = scrubbedDF.withColumn("default_tags", SchemaTools.structToMap(scrubbedDF, "default_tags"))
+        .withColumn("custom_tags", SchemaTools.structToMap(scrubbedDF, "custom_tags"))
+        .withColumn("spark_conf", SchemaTools.structToMap(scrubbedDF, "spark_conf"))
+        .withColumn("spark_env_vars", SchemaTools.structToMap(scrubbedDF, "spark_env_vars"))
+        .withColumn(s"aws_attributes", SchemaTools.structToMap(scrubbedDF, s"aws_attributes"))
+        .withColumn(s"azure_attributes", SchemaTools.structToMap(scrubbedDF, s"azure_attributes"))
+        .withColumn(s"gcp_attributes", SchemaTools.structToMap(scrubbedDF, s"gcp_attributes"))
         .withColumn("organization_id", lit(config.organizationId))
-        .verifyMinimumSchema(clusterSnapMinimumSchema)
-
-      val explodedDF = finalDF
-        .withColumnRenamed("custom_tags", "custom_tags_old")
-        .selectExpr("*", "spec.custom_tags")
-
-      val normalizedDf = explodedDF.withColumn("custom_tags", SchemaTools.structToMap(explodedDF, "custom_tags"))
-
-      // Replace the custom_tags field inside the spec struct with custom_tags outside of spec column
-      val updatedDf = normalizedDf.schema.fields.find(_.name == "spec") match {
-        case Some(field) =>
-          field.dataType match {
-            case structType: StructType =>
-              // Create a new struct expression, replacing the specified field with the new column
-              val newFields = structType.fields.map { f =>
-                if (f.name.equalsIgnoreCase("custom_tags")) {
-                  col("custom_tags").as("custom_tags") // Replace with new column if names match
-                } else {
-                  col(s"spec.${f.name}") // Keep existing fields as is
-                }
-              }
-              // Update the DataFrame with the new struct replacing the old one
-              normalizedDf.withColumn("spec", struct(newFields: _*))
-            case _ => normalizedDf // No action if the specified structColName is not a struct type
-          }
-        case None => normalizedDf // No action if the specified structColName does not exist
-      }
-
-      updatedDf.drop("custom_tags")
-        .withColumnRenamed("custom_tags_old", "custom_tags")
-
+        .drop("spec")
+      df.verifyMinimumSchema(clusterSnapMinimumSchema)
     } else {
       throw new NoNewDataException(msg, Level.WARN, true)
    }
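The refactor replaces the custom_tags surgery on the spec struct with a simpler rule: scrub the schema, convert each loosely-typed struct (tags, conf, cloud attributes) to MapType, drop spec outright, and verify the minimum schema last. SchemaTools.structToMap is Overwatch's own helper; as a rough sketch of what such a conversion does conceptually (not the actual implementation), one can build a map<string,string> from a struct column's field names and stringified values:

import org.apache.spark.sql.{Column, DataFrame}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.StructType

// Sketch only: turn a struct column with arbitrary fields into a map column,
// so snapshots whose structs carry different field sets can still be unioned.
def structToMapSketch(df: DataFrame, structCol: String): Column = {
  val fieldNames = df.schema(structCol).dataType match {
    case st: StructType => st.fieldNames.toSeq
    case _ => Seq.empty[String]
  }
  if (fieldNames.isEmpty) lit(null).cast("map<string,string>")
  else map_from_arrays(
    array(fieldNames.map(lit(_)): _*),                                        // keys: the struct's field names
    array(fieldNames.map(f => col(s"`$structCol`.`$f`").cast("string")): _*)  // values, cast to string
  )
}

// Usage, mirroring the new code path:
// scrubbedDF.withColumn("custom_tags", structToMapSketch(scrubbedDF, "custom_tags"))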

src/main/scala/com/databricks/labs/overwatch/pipeline/WorkflowsTransforms.scala

Lines changed: 2 additions & 2 deletions
@@ -1325,10 +1325,10 @@ object WorkflowsTransforms extends SparkSessionWrapper {
       clusterStateEndOrPipelineEnd.alias("unixTimeMS_state_end"), // if clusterState still open -- close it for calculations
       'timestamp_state_start,
       'timestamp_state_end, 'state, 'cloud_billable, 'databricks_billable, 'uptime_in_state_H, 'current_num_workers, 'target_num_workers,
-      $"driverSpecs.API_Name".alias("driver_node_type_id"),
+      coalesce('driver_node_type_id, $"driverSpecs.API_Name").alias("driver_node_type_id"),
       $"driverSpecs.Compute_Contract_Price".alias("driver_compute_hourly"),
       $"driverSpecs.Hourly_DBUs".alias("driver_dbu_hourly"),
-      $"workerSpecs.API_Name".alias("node_type_id"),
+      coalesce('node_type_id, $"workerSpecs.API_Name").alias("node_type_id"),
       $"workerSpecs.Compute_Contract_Price".alias("worker_compute_hourly"),
       $"workerSpecs.Hourly_DBUs".alias("worker_dbu_hourly"),
       $"workerSpecs.vCPUs".alias("worker_cores"),

src/main/scala/com/databricks/labs/overwatch/utils/SchemaScrubber.scala

Lines changed: 4 additions & 2 deletions
@@ -86,10 +86,12 @@ class SchemaScrubber(
       s"DUPLICATE FIELDS:\n" +
       s"${dups.mkString("\n")}"
     logger.log(Level.WARN, warnMsg)
+    val counterMap = scala.collection.mutable.Map[String, Int]().withDefaultValue(0)
     fields.map(f => {
-      val fieldName = if (caseSensitive) f.sanitizedField.name else f.sanitizedField.name.toLowerCase
+      val fieldName = if (caseSensitive) f.sanitizedField.name.trim else f.sanitizedField.name.toLowerCase.trim
       if (dups.contains(fieldName)) {
-        val generatedUniqueName = f.sanitizedField.name + "_UNIQUESUFFIX_" + f.originalField.name.hashCode.toString
+        counterMap(fieldName) += 1
+        val generatedUniqueName = f.sanitizedField.name.trim + "_UNIQUESUFFIX_" + f.originalField.name.trim.hashCode.toString + "_" + counterMap(fieldName)
         val uniqueColumnMapping = s"\n${f.originalField.name} --> ${generatedUniqueName}"
         logger.log(Level.WARN, uniqueColumnMapping)
         f.sanitizedField.copy(name = generatedUniqueName)
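The old suffix was derived solely from the original field name's hashCode, so two occurrences of the same name produced identical "unique" names — visible in the old test expectation below, where dup1_UNIQUESUFFIX_95946320 appears twice. The per-name counter makes each generated name distinct. A standalone sketch of the new behavior (field names hypothetical):

import scala.collection.mutable

// A per-name counter appended after the hash suffix guarantees uniqueness
// even when duplicate fields share the same originalName hash.
val dups = Set("dup1")
val counter = mutable.Map[String, Int]().withDefaultValue(0)

def uniquify(sanitizedName: String, originalName: String): String = {
  val key = sanitizedName.toLowerCase.trim
  if (dups.contains(key)) {
    counter(key) += 1
    s"${sanitizedName.trim}_UNIQUESUFFIX_${originalName.trim.hashCode}_${counter(key)}"
  } else sanitizedName
}

uniquify("dup1", "dup1") // dup1_UNIQUESUFFIX_<hash>_1
uniquify("dup1", "dup1") // dup1_UNIQUESUFFIX_<hash>_2 -- same hash, distinct name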

src/test/scala/com/databricks/labs/overwatch/utils/SchemaToolsTest.scala

Lines changed: 2 additions & 2 deletions
@@ -270,8 +270,8 @@ class SchemaToolsTest extends AnyFunSpec with SparkSessionTestWrapper with Given
 
     val expectedResString = "`b_2_2_2` STRUCT<`abc`: STRING, `c_1__45`: BIGINT>,`exception_parent` " +
       "STRUCT<`dup1`: BIGINT, `dup2`: BIGINT, `xyz`: STRUCT<`_mixed`: BIGINT, `_bad`: BIGINT, " +
-      "`dup1_UNIQUESUFFIX_95946320`: BIGINT, `dup1_UNIQUESUFFIX_95946320`: BIGINT, `dup2_UNIQUESUFFIX_3095059`: " +
-      "BIGINT, `dup2_UNIQUESUFFIX_3095059`: STRING, `good_col`: BIGINT, `jkl`: BIGINT, `otherexcept`: BIGINT>, " +
+      "`dup1_UNIQUESUFFIX_95946320_1`: BIGINT, `dup1_UNIQUESUFFIX_95946320_2`: BIGINT, `dup2_UNIQUESUFFIX_3095059_1`: " +
+      "BIGINT, `dup2_UNIQUESUFFIX_3095059_2`: STRING, `good_col`: BIGINT, `jkl`: BIGINT, `otherexcept`: BIGINT>, " +
       "`zyx`: BIGINT>,`i_1` BIGINT,`parentwspace` STRING,`validParent` STRING"
     val ddlFromLogic = df.scrubSchema(exceptionScrubber).schema.toDDL
     assertResult(expectedResString) {
