Changes from all commits
26 commits
e134bd4
Initial commit
gueniai May 31, 2024
da14f88
adding fix for schemaScrubber and StructToMap (#1232)
aman-db Jun 4, 2024
a5c8b54
Convert all the struct fields inside 'spec' column for cluster_snapsho…
souravbaner-da Jun 11, 2024
aeef7ff
Dropped Spec column from snapshot
souravbaner-da Jun 11, 2024
4f64d83
Squash merge of show_dataframe_in_logs
neilbest-db Jun 13, 2024
0fdac8e
Squash merge of named_transformation
neilbest-db Jun 13, 2024
1175940
Refactor lookups in Silver Job Runs
neilbest-db Jun 13, 2024
98cfed6
use `NamedTransformation`s in Silver Job Runs
neilbest-db Jun 12, 2024
d751d5f
adjust Silver Job Runs module configuration and caching
neilbest-db Jun 21, 2024
0c8c9c2
add more Spark UI labels to Silver Job Runs transformations
neilbest-db Jun 25, 2024
2ead752
flip transformation names to beginning of label
neilbest-db Jun 25, 2024
97236ae
Initial commit
gueniai May 8, 2024
9fe9f8c
revert Spark UI Job Group labels
neilbest-db Jun 29, 2024
995e0da
Merge branch '0820_release' into 1228-silver-job-runs-spark312-r0812
neilbest-db Jun 29, 2024
25671b7
recon enhancement done to deal with different columns in source and t…
sriram251-code Jul 8, 2024
f7460bd
adjust Silver Job Runs module configuration (#1256)
neilbest-db Jul 30, 2024
caa3282
append null columns from cluster snapshot for cluster_spec_silver (#1…
sriram251-code Aug 7, 2024
f3ffd7c
1201 collect all event logs on first run (#1255)
souravbaner-da Aug 7, 2024
3c16b5f
Redefine views so that they are created from tables not locations (#1…
souravbaner-da Aug 7, 2024
d6fa441
1030 pipeline validation framework (#1071)
souravbaner-da Aug 7, 2024
59daae5
adding fix for duplicate accountId in module 2010 and 3019 (#1270)
aman-db Aug 8, 2024
3055a22
1218 warehouse state details (#1254)
aman-db Aug 12, 2024
bbdb61f
Add descriptive `NamedTransformation`s to Spark UI (#1223)
neilbest-db Aug 20, 2024
12fd6ac
adding code for warehouseStateFact gold (#1265)
aman-db Aug 23, 2024
1eb6a00
Show `DataFrame` records in logs (#1224)
neilbest-db Aug 23, 2024
75c8760
Merge branch '0820_release' into 1228-silver-job-runs-spark312-r0812
neilbest-db Aug 26, 2024
8 changes: 6 additions & 2 deletions build.sbt
@@ -2,14 +2,17 @@ name := "overwatch"

organization := "com.databricks.labs"

version := "0.8.1.2"
version := "0.8.2.0"

scalaVersion := "2.12.12"
scalacOptions ++= Seq("-Xmax-classfile-name", "78")

Test / fork := true
Test / envVars := Map("OVERWATCH_ENV" -> " ","OVERWATCH_TOKEN" -> " ","OVERWATCH" -> " ")

logBuffered in Test := false
// parallelExecution in Test := false

val sparkVersion = "3.1.2"
libraryDependencies += "org.apache.spark" %% "spark-core" % sparkVersion % Provided
libraryDependencies += "org.apache.spark" %% "spark-sql" % sparkVersion % Provided
@@ -18,6 +21,7 @@ libraryDependencies += "com.databricks" % "dbutils-api_2.12" % "0.0.5" % Provide
libraryDependencies += "com.amazonaws" % "aws-java-sdk-s3" % "1.11.595" % Provided
libraryDependencies += "io.delta" % "delta-core_2.12" % "1.0.0" % Provided
libraryDependencies += "org.scalaj" %% "scalaj-http" % "2.4.2"
libraryDependencies += "com.lihaoyi" %% "sourcecode" % "0.4.1"

//libraryDependencies += "org.apache.hive" % "hive-metastore" % "2.3.9"

@@ -51,4 +55,4 @@ assemblyMergeStrategy in assembly := {
case PathList("META-INF", xs @ _*) => MergeStrategy.discard
case x => MergeStrategy.first
}
assembly / assemblyOption := (assembly / assemblyOption).value.copy(includeScala = false)
assembly / assemblyOption := (assembly / assemblyOption).value.copy(includeScala = false)
19 changes: 19 additions & 0 deletions src/main/resources/Warehouse_DBU_Details.csv
@@ -0,0 +1,19 @@
cloud,cluster_size,driver_size,worker_count,total_dbus
AWS,2X-Small,i3.2xlarge,1,4
AWS,X-Small,i3.2xlarge,2,6
AWS,Small,i3.4xlarge,4,12
AWS,Medium,i3.8xlarge,8,24
AWS,Large,i3.8xlarge,16,40
AWS,X-Large,i3.16xlarge,32,80
AWS,2X-Large,i3.16xlarge,64,144
AWS,3X-Large,i3.16xlarge,128,272
AWS,4X-Large,i3.16xlarge,256,528
AZURE,2X-Small,E8ds v4,1,4
AZURE,X-Small,E8ds v4,2,6
AZURE,Small,E16ds v4,4,12
AZURE,Medium,E32ds v4,8,24
AZURE,Large,E32ds v4,16,40
AZURE,X-Large,E64ds v4,32,80
AZURE,2X-Large,E64ds v4,64,144
AZURE,3X-Large,E64ds v4,128,272
AZURE,4X-Large,E64ds v4,256,528
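This lookup maps each warehouse t-shirt size to a driver type, worker count, and total DBU figure per cloud. A minimal sketch of how such a resource file might be loaded into a lookup DataFrame, assuming a SparkSession and an illustrative path (the actual wiring into the warehouse DBU target is not shown in this hunk):

// Hypothetical loader for the warehouse DBU lookup; Overwatch's real plumbing may differ.
val warehouseDbuLookup = spark.read
  .option("header", "true")
  .option("inferSchema", "true")
  .csv("src/main/resources/Warehouse_DBU_Details.csv")

// Illustrative use (assuming some DataFrame `warehouseStates` with cloud and cluster_size columns):
// val statesWithDbus = warehouseStates.join(warehouseDbuLookup, Seq("cloud", "cluster_size"), "left")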
@@ -122,7 +122,7 @@ class MultiWorkspaceDeployment extends SparkSessionWrapper {
val sqlComputerDBUPrice: Double = config.sql_compute_dbu_price
val jobsLightDBUPrice: Double = config.jobs_light_dbu_price
val customWorkspaceName: String = config.workspace_name
val standardScopes = "audit,sparkEvents,jobs,clusters,clusterEvents,notebooks,pools,accounts,dbsql,notebookCommands".split(",")
val standardScopes = OverwatchScope.toArray
val scopesToExecute = (standardScopes.map(_.toLowerCase).toSet --
config.excluded_scopes.getOrElse("").split(":").map(_.toLowerCase).toSet).toArray

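With this change the standard scope list comes from OverwatchScope itself rather than a hard-coded string, while excluded_scopes still arrives as a colon-delimited string and is subtracted case-insensitively. A small self-contained sketch of that subtraction with made-up values:

// Stand-ins for OverwatchScope.toArray and config.excluded_scopes.
val standardScopes = Array("audit", "sparkEvents", "clusterEvents", "dbsql")
val excludedScopes = Some("DBSQL:clusterEvents")

val scopesToExecute = (standardScopes.map(_.toLowerCase).toSet --
  excludedScopes.getOrElse("").split(":").map(_.toLowerCase).toSet).toArray
// yields the lower-cased remainder, e.g. Array("audit", "sparkevents"); ordering is not guaranteed after the Set round-trip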
26 changes: 26 additions & 0 deletions src/main/scala/com/databricks/labs/overwatch/api/ApiMeta.scala
@@ -200,6 +200,32 @@ trait ApiMeta {
jsonObject.toString
}

/**
* Adds API traceability metadata to the raw API response.
*
* @param response  raw HTTP response returned by the API call
* @param jsonQuery JSON request body sent with POST calls, used as the batch key filter
* @param queryMap  query parameters sent with GET calls
* @return a JSON string containing the raw API response and the traceability metadata for the call.
*/
private[overwatch] def enrichAPIResponse(response: HttpResponse[String], jsonQuery: String, queryMap: Map[String, String]): String = {
val filter: String = if (apiCallType.equals("POST")) jsonQuery else {
val mapper = new ObjectMapper()
mapper.registerModule(DefaultScalaModule)
mapper.writeValueAsString(queryMap)
}
val jsonObject = new JSONObject();
val apiTraceabilityMeta = new JSONObject();
apiTraceabilityMeta.put("endPoint", apiName)
apiTraceabilityMeta.put("type", apiCallType)
apiTraceabilityMeta.put("apiVersion", apiV)
apiTraceabilityMeta.put("responseCode", response.code)
apiTraceabilityMeta.put("batchKeyFilter", filter)
jsonObject.put("rawResponse", response.body.trim)
jsonObject.put("apiTraceabilityMeta", apiTraceabilityMeta)
jsonObject.toString
}

}

/**
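The enriched payload wraps the raw API body together with traceability fields (endPoint, type, apiVersion, responseCode, batchKeyFilter). As a reference only, a minimal sketch of how a downstream reader might unwrap that string with the same org.json API used above; the function name is illustrative, not part of this PR:

import org.json.JSONObject

// Hypothetical consumer of the string produced by enrichAPIResponse.
def summarizeEnrichedResponse(enriched: String): String = {
  val parsed  = new JSONObject(enriched)
  val rawBody = parsed.getString("rawResponse")              // original API response body
  val meta    = parsed.getJSONObject("apiTraceabilityMeta")
  s"${meta.getString("type")} ${meta.getString("endPoint")} -> HTTP ${meta.getInt("responseCode")} (${rawBody.length} chars)"
}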
36 changes: 36 additions & 0 deletions src/main/scala/com/databricks/labs/overwatch/env/Workspace.scala
@@ -17,6 +17,8 @@ import scala.concurrent.Future
import scala.concurrent.forkjoin.ForkJoinPool
import scala.util.{Failure, Success, Try}
import scala.concurrent.ExecutionContext.Implicits.global
import java.time.LocalDateTime
import java.time.format.DateTimeFormatter


/**
@@ -422,6 +424,40 @@ class Workspace(config: Config) extends SparkSessionWrapper {
addReport
}

/**
* Fetches warehouse event data from system.compute.warehouse_events.
* @param fromTime       start of the window to fetch
* @param untilTime      end of the window to fetch
* @param config         Overwatch config, used for the workspace id and the audit log source check
* @param maxHistoryDays maximum number of days to look back before fromTime (default 30)
* @return DataFrame of warehouse events with columns renamed to Overwatch conventions
*/
def getWarehousesEventDF(fromTime: TimeTypes,
untilTime: TimeTypes,
config: Config,
maxHistoryDays: Int = 30
): DataFrame = {
val sysTableFormat = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss.SSS")
val moduleFromTime = fromTime.asLocalDateTime.format(sysTableFormat)
val moduleUntilTime = untilTime.asLocalDateTime.format(sysTableFormat)
val useSystemTableMessage = "Use system tables as the source for audit logs"
val tableDoesNotExistsMessage = "Table system.compute.warehouse_events does not exist"

if(config.auditLogConfig.systemTableName.isEmpty)
throw new NoNewDataException(useSystemTableMessage, Level.WARN, allowModuleProgression = false)

if(!spark.catalog.tableExists("system.compute.warehouse_events"))
throw new NoNewDataException(tableDoesNotExistsMessage, Level.WARN, allowModuleProgression = false)

spark.sql(s"""
select * from system.compute.warehouse_events
WHERE workspace_id = '${config.organizationId}'
and event_time >= DATE_SUB('${moduleFromTime}', ${maxHistoryDays})
and event_time <= '${moduleUntilTime}'
""")
.withColumnRenamed("event_type","state")
.withColumnRenamed("workspace_id","organization_id")
.withColumnRenamed("event_time","timestamp")
}
}


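The query above restricts system.compute.warehouse_events to the current workspace and to a window opening up to maxHistoryDays before fromTime and closing at untilTime. A rough DataFrame-API equivalent of the same filter, shown only as a sketch and assuming the variables defined in the method above:

import org.apache.spark.sql.functions.{col, date_sub, lit}

spark.table("system.compute.warehouse_events")
  .filter(col("workspace_id") === config.organizationId)
  .filter(col("event_time") >= date_sub(lit(moduleFromTime), maxHistoryDays))   // look back up to maxHistoryDays
  .filter(col("event_time") <= lit(moduleUntilTime))
  .withColumnRenamed("event_type", "state")
  .withColumnRenamed("workspace_id", "organization_id")
  .withColumnRenamed("event_time", "timestamp")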
@@ -104,8 +104,6 @@ class Bronze(_workspace: Workspace, _database: Database, _config: Config)

}



lazy private[overwatch] val jobsSnapshotModule = Module(1001, "Bronze_Jobs_Snapshot", this)
lazy private val appendJobsProcess: () => ETLDefinition = {
() =>
@@ -171,6 +169,7 @@ class Bronze(_workspace: Workspace, _database: Database, _config: Config)
BronzeTargets.clustersSnapshotTarget.asDF,
Seq(
prepClusterEventLogs(
clusterEventLogsModule.isFirstRun,
BronzeTargets.auditLogsTarget.asIncrementalDF(clusterEventLogsModule, BronzeTargets.auditLogsTarget.incrementalColumns, additionalLagDays = 1), // 1 lag day to get laggard records
clusterEventLogsModule.fromTime,
clusterEventLogsModule.untilTime,
@@ -410,12 +410,21 @@ trait BronzeTransforms extends SparkSessionWrapper {
pipelineSnapTime: Long,
tmpClusterEventsSuccessPath: String,
tmpClusterEventsErrorPath: String,
config: Config) = {
config: Config,
isFirstRun: Boolean) = {
val finalResponseCount = clusterIDs.length
val clusterEventsEndpoint = "clusters/events"

val lagTime = 86400000 //1 day
val lagStartTime = startTime.asUnixTimeMilli - lagTime

val lagStartTime = if (isFirstRun) {
logger.log(Level.INFO, "First run, acquiring all cluster events")
0.toLong
} else {
logger.log(Level.INFO, "Subsequent run, acquiring new cluster events")
startTime.asUnixTimeMilli - lagTime
}

// creating Json input for parallel API calls
val jsonInput = Map(
"start_value" -> "0",
@@ -601,6 +610,7 @@ }
}

protected def prepClusterEventLogs(
isFirstRun : Boolean,
filteredAuditLogDF: DataFrame,
startTime: TimeTypes,
endTime: TimeTypes,
@@ -626,18 +636,23 @@

val tmpClusterEventsSuccessPath = s"${config.tempWorkingDir}/${apiEndpointTempDir}/success_" + pipelineSnapTS.asUnixTimeMilli
val tmpClusterEventsErrorPath = s"${config.tempWorkingDir}/${apiEndpointTempDir}/error_" + pipelineSnapTS.asUnixTimeMilli
try{
landClusterEvents(clusterIDs, startTime, endTime, pipelineSnapTS.asUnixTimeMilli, tmpClusterEventsSuccessPath,
tmpClusterEventsErrorPath, config)
}catch {
try {
landClusterEvents(
clusterIDs, startTime, endTime,
pipelineSnapTS.asUnixTimeMilli,
tmpClusterEventsSuccessPath,
tmpClusterEventsErrorPath,
config,
isFirstRun)
} catch {
case e: Throwable =>
val errMsg = s"Error in landing cluster events: ${e.getMessage}"
logger.log(Level.ERROR, errMsg)
throw e
}
if (Helpers.pathExists(tmpClusterEventsErrorPath)) {
persistErrors(
deriveRawApiResponseDF(spark.read.json(tmpClusterEventsErrorPath))
spark.read.json(tmpClusterEventsErrorPath)
.withColumn("from_ts", toTS(col("from_epoch")))
.withColumn("until_ts", toTS(col("until_epoch"))),
database,
@@ -1375,4 +1390,4 @@ }
}


}
}
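In effect, isFirstRun only changes the lower bound of the clusters/events request window: the first run starts from epoch zero to collect all available history, while subsequent runs look back one lag day before the module's fromTime. A compact, self-contained sketch of that decision (the function name is illustrative, not part of the PR):

// Lower bound, in epoch milliseconds, for the clusters/events API window.
def clusterEventsLagStart(isFirstRun: Boolean, fromTimeMillis: Long): Long = {
  val oneDayMillis = 86400000L
  if (isFirstRun) 0L                      // first run: acquire all cluster events
  else fromTimeMillis - oneDayMillis      // subsequent runs: one lag day before fromTime
}

For example, clusterEventsLagStart(isFirstRun = false, startTime.asUnixTimeMilli) reproduces the pre-existing behaviour, while passing isFirstRun = true widens the window to the beginning of time.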
@@ -27,7 +27,20 @@ class ETLDefinition(

val transformedDF = transforms.foldLeft(verifiedSourceDF) {
case (df, transform) =>
df.transform(transform)

/*
* reverting Spark UI Job Group labels for now
*
* TODO: enumerate the regressions this would introduce
* when the labels set by the platform are replaced
* this way.
* df.sparkSession.sparkContext.setJobGroup(
* s"${module.pipeline.config.workspaceName}:${module.moduleName}",
* transform.toString)
*/

df.transform( transform)

}
write(transformedDF, module)
}
22 changes: 22 additions & 0 deletions src/main/scala/com/databricks/labs/overwatch/pipeline/Gold.scala
@@ -320,6 +320,24 @@ class Gold(_workspace: Workspace, _database: Database, _config: Config)
)
}

lazy private[overwatch] val warehouseStateFactModule = Module(3020, "Gold_WarehouseStateFact", this, Array(2022, 2021), 3.0)
lazy private val appendWarehouseStateFactProcess: () => ETLDefinition = {
() =>
ETLDefinition(
SilverTargets.warehousesStateDetailTarget.asIncrementalDF(
warehouseStateFactModule,
SilverTargets.warehousesStateDetailTarget.incrementalColumns,
GoldTargets.warehouseStateFactTarget.maxMergeScanDates
),
Seq(buildWarehouseStateFact(
BronzeTargets.cloudMachineDetail,
BronzeTargets.warehouseDbuDetail,
SilverTargets.warehousesSpecTarget
)),
append(GoldTargets.warehouseStateFactTarget)
)
}

private def processSparkEvents(): Unit = {

sparkExecutorModule.execute(appendSparkExecutorProcess)
@@ -400,6 +418,10 @@
notebookCommandsFactModule.execute(appendNotebookCommandsFactProcess)
GoldTargets.notebookCommandsFactViewTarget.publish(notebookCommandsFactViewColumnMapping)
}
case OverwatchScope.warehouseEvents => {
warehouseStateFactModule.execute(appendWarehouseStateFactProcess)
GoldTargets.warehouseStateFactViewTarget.publish(warehouseStateFactViewColumnMappings)
}
case _ =>
}
}