Skip to content
Open
Show file tree
Hide file tree
Changes from 14 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
e134bd4
Initial commit
gueniai May 31, 2024
da14f88
adding fix for schemaScrubber and StructToMap (#1232)
aman-db Jun 4, 2024
a5c8b54
Convert all the struct fields inside 'spec' column for cluster_snapsho…
souravbaner-da Jun 11, 2024
aeef7ff
Dropped Spec column from snapshot
souravbaner-da Jun 11, 2024
4f64d83
Squash merge of show_dataframe_in_logs
neilbest-db Jun 13, 2024
0fdac8e
Squash merge of named_transformation
neilbest-db Jun 13, 2024
1175940
Refactor lookups in Silver Job Runs
neilbest-db Jun 13, 2024
98cfed6
use `NamedTransformation`s in Silver Job Runs
neilbest-db Jun 12, 2024
d751d5f
adjust Silver Job Runs module configuration and caching
neilbest-db Jun 21, 2024
0c8c9c2
add more Spark UI labels to Silver Job Runs transformations
neilbest-db Jun 25, 2024
2ead752
flip transformation names to beginning of label
neilbest-db Jun 25, 2024
97236ae
Initial commit
gueniai May 8, 2024
9fe9f8c
revert Spark UI Job Group labels
neilbest-db Jun 29, 2024
995e0da
Merge branch '0820_release' into 1228-silver-job-runs-spark312-r0812
neilbest-db Jun 29, 2024
25671b7
recon enhancement done to deal with different columns in source and t…
sriram251-code Jul 8, 2024
f7460bd
adjust Silver Job Runs module configuration (#1256)
neilbest-db Jul 30, 2024
caa3282
append null columns from cluster snapshot for cluster_spec_silver (#1…
sriram251-code Aug 7, 2024
f3ffd7c
1201 collect all event logs on first run (#1255)
souravbaner-da Aug 7, 2024
3c16b5f
Redefine views so that they are created from tables not locations (#1…
souravbaner-da Aug 7, 2024
d6fa441
1030 pipeline validation framework (#1071)
souravbaner-da Aug 7, 2024
59daae5
adding fix for duplicate accountId in module 2010 and 3019 (#1270)
aman-db Aug 8, 2024
3055a22
1218 warehouse state details (#1254)
aman-db Aug 12, 2024
bbdb61f
Add descriptive `NamedTransformation`s to Spark UI (#1223)
neilbest-db Aug 20, 2024
12fd6ac
adding code for warehouseStateFact gold (#1265)
aman-db Aug 23, 2024
1eb6a00
Show `DataFrame` records in logs (#1224)
neilbest-db Aug 23, 2024
75c8760
Merge branch '0820_release' into 1228-silver-job-runs-spark312-r0812
neilbest-db Aug 26, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions build.sbt
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ name := "overwatch"

organization := "com.databricks.labs"

version := "0.8.1.2"
version := "0.8.2.0"

scalaVersion := "2.12.12"
scalacOptions ++= Seq("-Xmax-classfile-name", "78")
Expand All @@ -18,6 +18,7 @@ libraryDependencies += "com.databricks" % "dbutils-api_2.12" % "0.0.5" % Provide
libraryDependencies += "com.amazonaws" % "aws-java-sdk-s3" % "1.11.595" % Provided
libraryDependencies += "io.delta" % "delta-core_2.12" % "1.0.0" % Provided
libraryDependencies += "org.scalaj" %% "scalaj-http" % "2.4.2"
libraryDependencies += "com.lihaoyi" %% "sourcecode" % "0.4.1"

//libraryDependencies += "org.apache.hive" % "hive-metastore" % "2.3.9"

Expand Down Expand Up @@ -51,4 +52,4 @@ assemblyMergeStrategy in assembly := {
case PathList("META-INF", xs @ _*) => MergeStrategy.discard
case x => MergeStrategy.first
}
assembly / assemblyOption := (assembly / assemblyOption).value.copy(includeScala = false)
assembly / assemblyOption := (assembly / assemblyOption).value.copy(includeScala = false)
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,19 @@ class ETLDefinition(

val transformedDF = transforms.foldLeft(verifiedSourceDF) {
case (df, transform) =>
df.transform(transform)

/*
* reverting Spark UI Job Group labels for now
*
* TODO: enumerate the regressions this would introduce
* when the labels set by the platform are replaced
* this way.
* df.sparkSession.sparkContext.setJobGroup(
* s"${module.pipeline.config.workspaceName}:${module.moduleName}",
* transform.toString)
*/

df.transform( transform)
}
write(transformedDF, module)
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -268,7 +268,11 @@ class Silver(_workspace: Workspace, _database: Database, _config: Config)
)
}

lazy private[overwatch] val jobRunsModule = Module(2011, "Silver_JobsRuns", this, Array(1004, 2010, 2014), shuffleFactor = 12.0)
lazy private[overwatch] val jobRunsModule =
Module( 2011, "Silver_JobsRuns", this, Array(1004, 2010, 2014), shuffleFactor = 12.0)
.withSparkOverrides( Map(
"spark.databricks.adaptive.autoOptimizeShuffle.enabled" -> "true"))

lazy private val appendJobRunsProcess: () => ETLDefinition = {
() =>
ETLDefinition(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,9 @@ import org.apache.spark.sql.functions._
import org.apache.spark.sql.{Column, DataFrame}


trait SilverTransforms extends SparkSessionWrapper {
trait SilverTransforms extends SparkSessionWrapper with DataFrameSyntax {

import TransformationDescriber._
import spark.implicits._

private val logger: Logger = Logger.getLogger(this.getClass)
Expand Down Expand Up @@ -1305,71 +1306,178 @@ trait SilverTransforms extends SparkSessionWrapper {
"runSucceeded", "runFailed", "runTriggered", "runNow", "runStart", "submitRun", "cancel", "repairRun"
)
val optimalCacheParts = Math.min(daysToProcess * getTotalCores * 2, 1000)
val jobRunsLag30D = getJobsBase(auditLogLag30D)
.filter('actionName.isin(jobRunActions: _*))
.repartition(optimalCacheParts)
.cache() // cached df removed at end of module run

// eagerly force this highly reused DF into cache()
jobRunsLag30D.count()

// Lookup to populate the clusterID/clusterName where missing from jobs
lazy val clusterSpecNameLookup = clusterSpec.asDF
.select('organization_id, 'timestamp, 'cluster_name, 'cluster_id.alias("clusterId"))
.filter('clusterId.isNotNull && 'cluster_name.isNotNull)

// Lookup to populate the clusterID/clusterName where missing from jobs
lazy val clusterSnapNameLookup = clusterSnapshot.asDF
.withColumn("timestamp", unix_timestamp('Pipeline_SnapTS) * lit(1000))
.select('organization_id, 'timestamp, 'cluster_name, 'cluster_id.alias("clusterId"))
.filter('clusterId.isNotNull && 'cluster_name.isNotNull)

// Lookup to populate the existing_cluster_id where missing from jobs -- it can be derived from name
lazy val jobStatusMetaLookup = jobsStatus.asDF
.verifyMinimumSchema(Schema.minimumJobStatusSilverMetaLookupSchema)
.select(
'organization_id,
'timestamp,
'jobId,
'jobName,
to_json('tags).alias("tags"),
'schedule,
'max_concurrent_runs,
'run_as_user_name,
'timeout_seconds,
'created_by,
'last_edited_by,
to_json('tasks).alias("tasks"),
to_json('job_clusters).alias("job_clusters"),
to_json($"task_detail_legacy.notebook_task").alias("notebook_task"),
to_json($"task_detail_legacy.spark_python_task").alias("spark_python_task"),
to_json($"task_detail_legacy.python_wheel_task").alias("python_wheel_task"),
to_json($"task_detail_legacy.spark_jar_task").alias("spark_jar_task"),
to_json($"task_detail_legacy.spark_submit_task").alias("spark_submit_task"),
to_json($"task_detail_legacy.shell_command_task").alias("shell_command_task"),
to_json($"task_detail_legacy.pipeline_task").alias("pipeline_task")
)
// val optimalCacheParts = Math.min( daysToProcess * 2, getTotalCores * 2)

lazy val jobSnapNameLookup = jobsSnapshot.asDF
.withColumn("timestamp", unix_timestamp('Pipeline_SnapTS) * lit(1000))
.select('organization_id, 'timestamp, 'job_id.alias("jobId"), $"settings.name".alias("jobName"))
logger.log( Level.INFO, s"optimalCacheParts: ${optimalCacheParts}")

val jobRunsLookups = jobRunsInitializeLookups(
(clusterSpec, clusterSpecNameLookup),
(clusterSnapshot, clusterSnapNameLookup),
(jobsStatus, jobStatusMetaLookup),
(jobsSnapshot, jobSnapNameLookup)
)
val filterAuditLog = NamedTransformation {
(df: DataFrame) => {
val cachedActions = getJobsBase( df)
.filter('actionName.isin( jobRunActions: _*))
.repartition( optimalCacheParts)
.cache() // cached df removed at end of module run

// eagerly force this highly reused DF into cache()
val n = cachedActions.count()

logger.log( Level.INFO, s"jobRunsLag30D count: ${n}")
logger.log( Level.INFO, s"jobRunsLag30D.rdd.partitions.size: ${cachedActions.rdd.partitions.size}")

cachedActions
}
}

val jobRunsLag30D = auditLogLag30D.transformWithDescription( filterAuditLog)

/*
* keep original usage example of `.showLines()` for reference
*
* logger.log( Level.INFO, "Showing first 5 rows of `jobRunsLag30D`:")
* jobRunsLag30D
* .showLines(5, 20, true)
* .foreach( logger.log( Level.INFO, _))
*
*/


/**
* Look up the cluster_name based on id first from
* `cluster_spec_silver`. If not present there, fall back to the
* latest cluster snapshot prior to the run.
*/

val jobRunsAppendClusterName = NamedTransformation {
(df: DataFrame) => {

val key = Seq( "organization_id", "clusterId")

lazy val clusterSpecNameLookup = clusterSpec.asDF
.select(
'organization_id,
'timestamp,
'cluster_name,
'cluster_id.alias("clusterId"))
.filter(
'clusterId.isNotNull
&& 'cluster_name.isNotNull)
.toTSDF( "timestamp", key:_*)

lazy val clusterSnapNameLookup = clusterSnapshot.asDF
.select(
// .withColumn("timestamp", unix_timestamp('Pipeline_SnapTS) * lit(1000))
'organization_id,
( unix_timestamp('Pipeline_SnapTS) * lit(1000)).alias( "timestamp"),
'cluster_name,
'cluster_id.alias( "clusterId"))
.filter(
'clusterId.isNotNull
&& 'cluster_name.isNotNull)
.toTSDF( "timestamp", key:_*)

df.toTSDF( "timestamp", key:_*)
.lookupWhen( clusterSpecNameLookup)
.lookupWhen( clusterSnapNameLookup)
.df
}
}

/**
* looks up the job name based on id first from job_status_silver
* and if not present there, fall back to the latest snapshot
* prior to the run.
*/

val jobRunsAppendJobMeta = NamedTransformation {
(df: DataFrame) => {

val key = Seq( "organization_id", "jobId")

lazy val jobStatusMetaLookup = jobsStatus.asDF
.verifyMinimumSchema(
Schema.minimumJobStatusSilverMetaLookupSchema)
.select(
'organization_id,
'timestamp,
'jobId,
'jobName,
to_json('tags).alias("tags"),
'schedule,
'max_concurrent_runs,
'run_as_user_name,
'timeout_seconds,
'created_by,
'last_edited_by,
to_json( 'tasks).alias("tasks"),
to_json( 'job_clusters).alias("job_clusters"),
to_json( $"task_detail_legacy.notebook_task").alias( "notebook_task"),
to_json( $"task_detail_legacy.spark_python_task").alias( "spark_python_task"),
to_json( $"task_detail_legacy.python_wheel_task").alias( "python_wheel_task"),
to_json( $"task_detail_legacy.spark_jar_task").alias( "spark_jar_task"),
to_json( $"task_detail_legacy.spark_submit_task").alias( "spark_submit_task"),
to_json( $"task_detail_legacy.shell_command_task").alias( "shell_command_task"),
to_json( $"task_detail_legacy.pipeline_task").alias( "pipeline_task"))
.toTSDF( "timestamp", key:_*)

lazy val jobSnapshotNameLookup = jobsSnapshot.asDF
// .withColumn("timestamp", unix_timestamp('Pipeline_SnapTS) * lit(1000))
.select(
'organization_id,
( unix_timestamp( 'Pipeline_SnapTS) * lit( 1000)).alias( "timestamp"),
'job_id.alias("jobId"),
$"settings.name".alias("jobName"))
.toTSDF( "timestamp", key:_*)

df.toTSDF( "timestamp", key:_*)
.lookupWhen( jobStatusMetaLookup)
.lookupWhen( jobSnapshotNameLookup)
.df
.withColumn( "jobName",
coalesce('jobName, 'run_name))
.withColumn( "tasks",
coalesce('tasks, 'submitRun_tasks))
.withColumn( "job_clusters",
coalesce('job_clusters, 'submitRun_job_clusters))

// the following is only valid in Spark > 3.1.2, apparently
// (it compiles with 3.3.0)

// .withColumns( Map(
// "jobName"
// -> coalesce('jobName, 'run_name),
// "tasks"
// -> coalesce('tasks, 'submitRun_tasks),
// "job_clusters"
// -> coalesce('job_clusters, 'submitRun_job_clusters)))
}
}



// val jobRunsLookups = jobRunsInitializeLookups(
// (clusterSpec, clusterSpecNameLookup),
// (clusterSnapshot, clusterSnapNameLookup),
// (jobsStatus, jobStatusMetaLookup),
// (jobsSnapshot, jobSnapNameLookup)
// )

// caching before structifying
jobRunsDeriveRunsBase(jobRunsLag30D, etlUntilTime)
.transform(jobRunsAppendClusterName(jobRunsLookups))
.transform(jobRunsAppendJobMeta(jobRunsLookups))
.transform(jobRunsStructifyLookupMeta(optimalCacheParts))
.transform(jobRunsAppendTaskAndClusterDetails)
.transform(jobRunsCleanseCreatedNestedStructures(targetKeys))
// .transform(jobRunsRollupWorkflowsAndChildren)
.drop("timestamp") // could be duplicated to enable asOf Lookups, dropping to clean up
val cachedDF = jobRunsLag30D
.transformWithDescription( jobRunsDeriveRunsBase( etlUntilTime))
.transformWithDescription( jobRunsAppendClusterName)
.transformWithDescription( jobRunsAppendJobMeta)
.repartition( optimalCacheParts)
.cache() // to speed up schema inference while "structifying" next

logger.log( Level.INFO, s"cachedDF count before structifying: ${cachedDF.count()}")
logger.log( Level.INFO, s"cachedDF.rdd.partitions.size: ${cachedDF.rdd.partitions.size}")

cachedDF
.transformWithDescription( jobRunsStructifyLookupMeta) // ( optimalCacheParts))
.transformWithDescription( jobRunsAppendTaskAndClusterDetails)
.transformWithDescription( jobRunsCleanseCreatedNestedStructures( targetKeys))
.drop("timestamp")
// `timestamp` could be duplicated to enable `asOf` lookups;
// dropping to clean up
}

protected def notebookSummary()(df: DataFrame): DataFrame = {
Expand Down
Loading