databrickslabs · neilbest-db · May 31, 2024 · Jun 4, 2024 · Jun 11, 2024 · Jun 11, 2024
diff --git a/build.sbt b/build.sbt
@@ -10,8 +10,8 @@ scalacOptions ++= Seq("-Xmax-classfile-name", "78")
 Test / fork  := true
 Test / envVars := Map("OVERWATCH_ENV" -> " ","OVERWATCH_TOKEN" -> " ","OVERWATCH" -> " ")
 
-logBuffered in Test := false
-// parallelExecution in Test := false
+Test / logBuffered := false
+// Test / parallelExecution := false
 
 val sparkVersion = "3.1.2"
 libraryDependencies += "org.apache.spark" %% "spark-core" % sparkVersion % Provided

diff --git a/src/main/scala/com/databricks/labs/overwatch/env/Database.scala b/src/main/scala/com/databricks/labs/overwatch/env/Database.scala
@@ -272,7 +272,7 @@ class Database(config: Config) extends SparkSessionWrapper {
     val explicitDatePartitionCondition = if (datePartitionFields.nonEmpty & maxMergeScanDates.nonEmpty) {
       s" AND target.${datePartitionFields.get} in (${maxMergeScanDates.mkString("'", "', '", "'")})"
     } else ""
-    val mergeCondition: String = immutableColumns.map(k => s"updates.$k = target.$k").mkString(" AND ") + " " +
+    val mergeCondition: String = immutableColumns.map(k => s"updates.$k <=> target.$k").mkString(" AND ") + " " + // `<=>` is null-safe equality, so NULL key values on both sides still match
       s"AND target.organization_id = '${config.organizationId}'" + // force partition filter for concurrent merge
       explicitDatePartitionCondition // force right side scan to only scan relevant dates
 

diff --git a/src/main/scala/com/databricks/labs/overwatch/pipeline/ETLDefinition.scala b/src/main/scala/com/databricks/labs/overwatch/pipeline/ETLDefinition.scala
@@ -27,6 +27,7 @@ class ETLDefinition(
 
     val transformedDF = transforms.foldLeft(verifiedSourceDF) {
       case (df, transform) =>
+
 	/* 
  	 * reverting Spark UI Job Group labels for now
    	 *
@@ -37,8 +38,9 @@ class ETLDefinition(
          *    s"${module.pipeline.config.workspaceName}:${module.moduleName}",
          *    transform.toString)
 	 */
-
-	df.transform( transform)
+
+        df.transform( transform)
+
     }
     write(transformedDF, module)
   }

diff --git a/src/main/scala/com/databricks/labs/overwatch/pipeline/GoldTransforms.scala b/src/main/scala/com/databricks/labs/overwatch/pipeline/GoldTransforms.scala
@@ -78,6 +78,7 @@ trait GoldTransforms extends SparkSessionWrapper {
       'git_source,
       'timeout_seconds,
       'max_concurrent_runs,
+      'queue,
       'max_retries,
       'retry_on_timeout,
       'min_retry_interval_millis,
@@ -141,17 +142,16 @@ trait GoldTransforms extends SparkSessionWrapper {
       'min_retry_interval_millis,
       'max_concurrent_runs,
       'run_as_user_name,
-//      'children,
-//      'workflow_children,
       'workflow_context,
       'task_detail_legacy,
       'submitRun_details,
       'created_by,
       'last_edited_by,
       'requestDetails.alias("request_detail"),
       'timeDetails.alias("time_detail"),
-      'startEpochMS
-    )
+      'startEpochMS,
+      'startTaskEpochMS)
+
     jobRunsLag30D
       .select(jobRunCols: _*)
   }
@@ -1053,7 +1053,7 @@ trait GoldTransforms extends SparkSessionWrapper {
   protected val jobViewColumnMapping: String =
     """
       |organization_id, workspace_name, job_id, action, date, timestamp, job_name, tags, tasks, job_clusters,
-      |libraries, timeout_seconds, max_concurrent_runs, max_retries, retry_on_timeout, min_retry_interval_millis,
+      |libraries, timeout_seconds, max_concurrent_runs, queue, max_retries, retry_on_timeout, min_retry_interval_millis,
       |schedule, existing_cluster_id, new_cluster, git_source, task_detail_legacy, is_from_dlt, aclPermissionSet,
       |targetUserId, session_id, request_id, user_agent, response, source_ip_address, created_by, created_ts,
       |deleted_by, deleted_ts, last_edited_by, last_edited_ts
@@ -1065,7 +1065,7 @@ trait GoldTransforms extends SparkSessionWrapper {
       |task_run_id, repair_id, task_key, cluster_type, cluster_id, cluster_name, job_cluster_key, job_cluster,
       |new_cluster, tags, task_detail, task_dependencies, task_runtime, task_execution_runtime, task_type,
       |terminal_state, job_trigger_type, schedule, libraries, manual_override_params, repair_details, timeout_seconds,
-      |retry_on_timeout, max_retries, min_retry_interval_millis, max_concurrent_runs, run_as_user_name, parent_run_id,
+      |retry_on_timeout, max_retries, min_retry_interval_millis, max_concurrent_runs, queue, run_as_user_name, parent_run_id,
       |workflow_context, task_detail_legacy, submitRun_details, created_by, last_edited_by, request_detail, time_detail
       |""".stripMargin
 

diff --git a/src/main/scala/com/databricks/labs/overwatch/pipeline/PipelineTargets.scala b/src/main/scala/com/databricks/labs/overwatch/pipeline/PipelineTargets.scala
@@ -321,10 +321,15 @@ abstract class PipelineTargets(config: Config) {
 
     lazy private[overwatch] val dbJobRunsTarget: PipelineTable = PipelineTable(
       name = "jobrun_silver",
-      _keys = Array("runId", "startEpochMS"),
+      _keys = Array(
+        "runId",
+        "startEpochMS",                 // was incorrectly equal to `$"timeDetails.startTime"` through 0.8.1.2
+                                        // via incorrect expression for `'TaskRunTime`; now `$"timeDetails.submissionTime"`
+        "startTaskEpochMS"              // added to make key for task runs complete; can be null
+      ),
       config,
       _mode = WriteMode.merge,
-      incrementalColumns = Array("startEpochMS"), // don't load into gold until run is terminated
+      incrementalColumns = Array("startEpochMS"),
       zOrderBy = Array("runId", "jobId"),
       partitionBy = Seq("organization_id", "__overwatch_ctrl_noise"),
       persistBeforeWrite = true,

diff --git a/src/main/scala/com/databricks/labs/overwatch/pipeline/Schema.scala b/src/main/scala/com/databricks/labs/overwatch/pipeline/Schema.scala
@@ -136,6 +136,8 @@ object Schema extends SparkSessionWrapper {
         StructField("name", StringType, nullable = true),
         StructField("timeout_seconds", LongType, nullable = true),
         StructField("max_concurrent_runs", LongType, nullable = true),
+        StructField("queue", StringType, nullable = true),
+        StructField("all_queued_runs", BooleanType, nullable = true),
         StructField("max_retries", LongType, nullable = true),
         StructField("retry_on_timeout", BooleanType, nullable = true),
         StructField("min_retry_interval_millis", LongType, nullable = true),
@@ -464,6 +466,9 @@ object Schema extends SparkSessionWrapper {
     StructField("timezone_id", StringType, true)
   ))
 
+  // Minimal struct used as the type of the `queue` field in the job schemas of this object; only `enabled` is modeled.
+  val minimumQueueSchema: StructType = StructType(Seq(StructField("enabled", BooleanType, nullable = true)))
+
   val minimumEmailNotificationsSchema: StructType = StructType(Seq(
     StructField("no_alert_for_skipped_runs", BooleanType, nullable = true),
     StructField("on_failure", ArrayType(StringType, containsNull = true), nullable = true)
@@ -502,20 +507,21 @@ object Schema extends SparkSessionWrapper {
   )))
 
   // minimum new jobs settings struct
-  val minimumNewSettingsSchema: StructType = StructType(Seq(
-    StructField("existing_cluster_id", StringType, nullable = true),
-    StructField("max_concurrent_runs", LongType, nullable = true),
-    StructField("name", StringType, nullable = true),
-    StructField("new_cluster", minimumNewClusterSchema, nullable = true),
-    StructField("timeout_seconds", LongType, nullable = true),
-    StructField("notebook_task", minimumNotebookTaskSchema, nullable = true),
-    StructField("spark_python_task", minimumSparkPythonTaskSchema, nullable = true),
-    StructField("python_wheel_task", minimumPythonWheelTaskSchema, nullable = true),
-    StructField("spark_jar_task", minimumSparkJarTaskSchema, nullable = true),
-    StructField("spark_submit_task", minimumSparkSubmitTaskSchema, nullable = true),
-    StructField("shell_command_task", minimumShellCommandTaskSchema, nullable = true),
-    StructField("pipeline_task", minimumPipelineTaskSchema, nullable = true),
-  ))
+  // (not used; no references)
+  // val minimumNewSettingsSchema: StructType = StructType(Seq(
+  //   StructField("existing_cluster_id", StringType, nullable = true),
+  //   StructField("max_concurrent_runs", LongType, nullable = true),
+  //   StructField("name", StringType, nullable = true),
+  //   StructField("new_cluster", minimumNewClusterSchema, nullable = true),
+  //   StructField("timeout_seconds", LongType, nullable = true),
+  //   StructField("notebook_task", minimumNotebookTaskSchema, nullable = true),
+  //   StructField("spark_python_task", minimumSparkPythonTaskSchema, nullable = true),
+  //   StructField("python_wheel_task", minimumPythonWheelTaskSchema, nullable = true),
+  //   StructField("spark_jar_task", minimumSparkJarTaskSchema, nullable = true),
+  //   StructField("spark_submit_task", minimumSparkSubmitTaskSchema, nullable = true),
+  //   StructField("shell_command_task", minimumShellCommandTaskSchema, nullable = true),
+  //   StructField("pipeline_task", minimumPipelineTaskSchema, nullable = true),
+  // ))
 
   val minimumJobStatusSilverMetaLookupSchema: StructType = StructType(Seq(
     StructField("organization_id", StringType, nullable = false),
@@ -527,6 +533,7 @@ object Schema extends SparkSessionWrapper {
     StructField("tags", MapType(StringType, StringType, valueContainsNull = true), nullable = true),
     StructField("schedule", minimumScheduleSchema, nullable = true),
     StructField("max_concurrent_runs", LongType, nullable = true),
+    StructField("queue", minimumQueueSchema, nullable = true),
     StructField("run_as_user_name", StringType, nullable = true),
     StructField("timeout_seconds", LongType, nullable = true),
     StructField("created_by", StringType, nullable = true),
@@ -535,23 +542,24 @@ object Schema extends SparkSessionWrapper {
   ))
 
   // simplified new settings struct
-  private[overwatch] val simplifiedNewSettingsSchema = StructType(Seq(
-    StructField("email_notifications", minimumEmailNotificationsSchema, nullable = true),
-    StructField("existing_cluster_id", StringType, nullable = true),
-    StructField("max_concurrent_runs", LongType, nullable = true),
-    StructField("name", StringType, nullable = true),
-    StructField("new_cluster", minimumNewClusterSchema, nullable = true),
-    StructField("notebook_task", minimumNotebookTaskSchema, nullable = true),
-    StructField("schedule", minimumScheduleSchema, nullable = true),
-    StructField("notebook_task",minimumNotebookTaskSchema, nullable = true),
-    StructField("spark_python_task",minimumSparkPythonTaskSchema, nullable = true),
-    StructField("python_wheel_task", minimumPythonWheelTaskSchema, nullable = true),
-    StructField("spark_jar_task",minimumSparkJarTaskSchema, nullable = true),
-    StructField("spark_submit_task", minimumSparkSubmitTaskSchema, nullable = true),
-    StructField("shell_command_task",minimumShellCommandTaskSchema, nullable = true),
-    StructField("pipeline_task", minimumPipelineTaskSchema, nullable = true),
-    StructField("timeout_seconds", LongType, nullable = true)
-  ))
+  // (not used; no references)
+  // private[overwatch] val simplifiedNewSettingsSchema = StructType(Seq(
+  //   StructField("email_notifications", minimumEmailNotificationsSchema, nullable = true),
+  //   StructField("existing_cluster_id", StringType, nullable = true),
+  //   StructField("max_concurrent_runs", LongType, nullable = true),
+  //   StructField("name", StringType, nullable = true),
+  //   StructField("new_cluster", minimumNewClusterSchema, nullable = true),
+  //   StructField("notebook_task", minimumNotebookTaskSchema, nullable = true),
+  //   StructField("schedule", minimumScheduleSchema, nullable = true),
+  //   StructField("notebook_task",minimumNotebookTaskSchema, nullable = true),
+  //   StructField("spark_python_task",minimumSparkPythonTaskSchema, nullable = true),
+  //   StructField("python_wheel_task", minimumPythonWheelTaskSchema, nullable = true),
+  //   StructField("spark_jar_task",minimumSparkJarTaskSchema, nullable = true),
+  //   StructField("spark_submit_task", minimumSparkSubmitTaskSchema, nullable = true),
+  //   StructField("shell_command_task",minimumShellCommandTaskSchema, nullable = true),
+  //   StructField("pipeline_task", minimumPipelineTaskSchema, nullable = true),
+  //   StructField("timeout_seconds", LongType, nullable = true)
+  // ))
 
   val streamingGoldMinimumSchema: StructType = StructType(Seq(
     StructField("cluster_id", StringType, nullable = false),
@@ -717,6 +725,7 @@ object Schema extends SparkSessionWrapper {
       StructField("libraries", minimumLibrariesSchema, nullable = true),
       StructField("git_source", minimumGitSourceSchema, nullable = true),
       StructField("max_concurrent_runs", LongType, nullable = true),
+      StructField("queue", minimumQueueSchema, nullable = true),
       StructField("max_retries", LongType, nullable = true),
       StructField("timeout_seconds", LongType, nullable = true),
       StructField("retry_on_timeout", BooleanType, nullable = true),
@@ -846,6 +855,7 @@ object Schema extends SparkSessionWrapper {
       StructField("git_source", minimumGitSourceSchema, nullable = true),
       StructField("timeout_seconds", LongType, nullable = true),
       StructField("max_concurrent_runs", LongType, nullable = true),
+      StructField("queue", minimumQueueSchema, nullable = true),
       StructField("max_retries", LongType, nullable = true),
       StructField("retry_on_timeout", BooleanType, nullable = true),
       StructField("min_retry_interval_millis", LongType, nullable = true),
@@ -877,6 +887,7 @@ object Schema extends SparkSessionWrapper {
       StructField("jobId", LongType, nullable = true),
       StructField("runId", LongType, nullable = false),
       StructField("startEpochMS", LongType, nullable = false),
+      StructField("startTaskEpochMS", LongType, nullable = false), // NOTE(review): PipelineTargets.dbJobRunsTarget documents this key as "can be null" — confirm nullable = false is intended here
       StructField("jobName", StringType, nullable = true),
       StructField("tags", MapType(StringType, StringType), nullable = true),
       StructField("jobRunId", LongType, nullable = true),
@@ -908,6 +919,7 @@ object Schema extends SparkSessionWrapper {
       StructField("max_retries", LongType, nullable = true),
       StructField("min_retry_interval_millis", LongType, nullable = true),
       StructField("max_concurrent_runs", LongType, nullable = true),
+      StructField("queue", minimumQueueSchema, nullable = true),
       StructField("run_as_user_name", StringType, nullable = true),
       StructField("workflow_context", StringType, nullable = true),
       StructField("task_detail_legacy", minimumTaskDetailSchema, nullable = true),