initial implementation of extractor usage report (#124)

max-zilla · web-flow · commit 4f92b6167912 · 2021-01-06T16:05:32.000-06:00
* initial implementation of report

* Update CHANGELOG.md

* stub for db update script

* Update MongoSalatPlugin.scala

* simplify user lookup based on db update

* Updates for parity with script

* logic improvements

* Clean up output formatting

* Add duration to queue jobs

* User ID script fix dataset IDs, print counts in script
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,10 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](http://keepachangelog.com/)
 and this project adheres to [Semantic Versioning](http://semver.org/).
 
+## Unreleased
+
+### Added
+- Added a new `/api/reports/metrics/extractors` report for summarizing extractor usage by user.
 
 ## 1.13.0 - 2020-12-02
 
@@ -33,6 +37,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/).
 ### Changed
 - Docker Images are now pushed to [github container registry](https://github.com/orgs/clowder-framework/packages)
 
+
 ## 1.12.0 - 2020-10-19
 **_Warning:_**
 - This update modifies the MongoDB schema. Make sure to start the application with `-DMONGOUPDATE=1`.
diff --git a/app/api/Reporting.scala b/app/api/Reporting.scala
@@ -11,10 +11,11 @@ import javax.inject.Inject
 import java.util.{Date, TimeZone}
 
 import services._
-import models.{Collection, Dataset, File, ProjectSpace, UUID, User, UserStatus}
-import util.Parsers
+import models.{Collection, Dataset, File, ProjectSpace, UUID, User, UserStatus, ExtractionJob}
 
-import scala.collection.mutable.ListBuffer
+import org.apache.commons.lang3.Range.between
+import scala.collection.mutable.{ListBuffer, Map => MutaMap}
+import util.Parsers
 
 
 /**
@@ -25,7 +26,8 @@ class Reporting @Inject()(selections: SelectionService,
                           files: FileService,
                           collections: CollectionService,
                           spaces: SpaceService,
-                          users: UserService) extends Controller with ApiController {
+                          users: UserService,
+                          extractions: ExtractionService) extends Controller with ApiController {
 
   val dateFormat = new java.text.SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'")
   dateFormat.setTimeZone(TimeZone.getTimeZone("GMT"))
@@ -500,4 +502,165 @@ class Reporting @Inject()(selections: SelectionService,
       "Content-Disposition" -> ("attachment; filename=SpaceStorage"+id.stringify+".csv")
     )
   }
+
+  /**
+   * Classifies an extraction event by its status message: a "SUBMITTED" event counts
+   * as time in the queue, every other status counts as work.
+   */
+  private def determineJobType(jobMsg: String): String = jobMsg match {
+    case "SUBMITTED" => "queue"
+    case _ => "work" // TODO: Better solution?
+  }
+
+  /**
+   * Generates a CSV report of extractor usage aggregated per user. Wrapped in ServerAdminAction.
+   *
+   * Extraction events are grouped per user and per "target - extractor" key. Events that arrive
+   * in sequence with the same job type (see determineJobType()) are merged into one job; a change
+   * of job type closes the ongoing job and opens a new one. Only jobs with a positive duration
+   * are written to the report.
+   *
+   * @param since optional lower bound on event start time, forwarded to extractions.getIterator
+   * @param until optional upper bound on event start time, forwarded to extractions.getIterator
+   * @return chunked text/csv response sent as the attachment ExtractorMetrics.csv
+   */
+  def extractorUsage(since: Option[String], until: Option[String]) = ServerAdminAction { implicit request =>
+    Logger.debug("Generating extraction metrics report")
+
+    /** This mapping is used to aggregate jobs.
+     * A job is considered some countable extraction duration. It has a jobType so
+     * we can attempt to differentiate "time in queue" from "time being processed".
+     *
+     * jobLookup: [
+     *  UserID -> [
+     *    UniqueJobKey -> (
+     *      jobs: [ list of closed jobs, same shape as current_job below ],
+     *      current_job: {
+     *        target      event.file_id (but can be a dataset ID or metadata ID in reality)
+     *        targetType  file/dataset
+     *        extractor   extractor id (e.g. ncsa.file.digest)
+     *        spaces      comma-separated ids of the spaces of the containing dataset
+     *        jobId       official job_id, if available
+     *        jobType     is this a queue event or an actual work event on a node? see determineJobType()
+     *        statusCount number of events merged into this job
+     *        lastStatus  most recent event.status for the job
+     *        start       earliest event.start time from events in this job (event.end is often blank)
+     *        end         latest event.start time from events in this job (event.end is often blank)
+     *      }
+     *    )
+     *  ]
+     * ]
+     */
+    type JobHistory = (List[ExtractionJob], Option[ExtractionJob])
+    val emptyHistory: JobHistory = (Nil, None)
+    val jobLookup: MutaMap[UUID, MutaMap[String, JobHistory]] = MutaMap.empty
+
+    val results = extractions.getIterator(true, since, until, None)
+    while (results.hasNext) {
+      val event = results.next
+
+      // Collect info to associate this event with a job if possible
+      val jobId = event.job_id.map(_.stringify).getOrElse("")
+      val jobType = determineJobType(event.status)
+      val uniqueKey = event.file_id + " - " + event.extractor_id
+
+      // Fetch the per-user map (created on first sight) and the history recorded for this key so far
+      val userJobs = jobLookup.getOrElseUpdate(event.user_id, MutaMap.empty[String, JobHistory])
+      val (jobList, currentJob) = userJobs.getOrElse(uniqueKey, emptyHistory)
+
+      currentJob match {
+        case Some(job) if job.jobType == jobType =>
+          // Same stage as the ongoing job: extend it.
+          // Don't overwrite DONE as final message in case we have small differences in timing of last extractor msg
+          val status = if (job.lastStatus == "DONE") job.lastStatus else event.status
+          val updatedJob = job.copy(statusCount = job.statusCount + 1, lastStatus = status, end = event.start)
+          userJobs(uniqueKey) = (jobList, Some(updatedJob))
+
+        case _ =>
+          // No ongoing job, or the job type changed: start a new ongoing job.
+          // Determine parent details for new job - quick dataset check first, then file search
+          val parentDatasets = datasets.findByFileIdAllContain(event.file_id)
+          val (spaceIds, resourceType) =
+            if (parentDatasets.length > 0) {
+              // When several datasets contain the file, the spaces of the last one are reported
+              (parentDatasets.last.spaces.mkString(","), "file")
+            } else {
+              datasets.get(event.file_id) match {
+                case Some(ds) => (ds.spaces.mkString(","), "dataset")
+                case None => ("", "file")
+              }
+            }
+
+          // Push current job to jobs list (saying it ended at start of next stage) and make new job entry
+          val closedJobs = currentJob match {
+            case Some(previous) => jobList ::: List(previous.copy(end = event.start))
+            case None => jobList
+          }
+          val newJob = ExtractionJob(event.file_id.stringify, resourceType, event.extractor_id, spaceIds, jobId,
+            jobType, 1, event.status, event.start, event.start)
+          userJobs(uniqueKey) = (closedJobs, Some(newJob))
+      }
+    }
+
+    // Formats one job as a fully quoted CSV line; embedded double quotes are doubled so the row stays parseable
+    def csvRow(userid: UUID, username: String, email: String, job: ExtractionJob, duration: Long): String = {
+      val fields = List(userid.stringify, username, email, job.target, job.targetType, job.spaces, job.extractor,
+        job.jobId, job.jobType, job.statusCount, job.lastStatus, job.start, job.end, duration)
+      fields.map(field => ("" + field).replace("\"", "\"\"")).mkString("\"", "\",\"", "\"\n")
+    }
+
+    // The generator emits the header first, then one chunk per user until the key iterator is exhausted
+    var headerRow = true
+    val keyiter = jobLookup.keysIterator
+    val csvChunks = Enumerator.generateM({
+      val chunk = if (headerRow) {
+        val headers = List("userid", "username", "email", "resource_id", "resource_type", "space_id", "extractor",
+          "job_id", "job_type", "status_count", "last_status", "start", "end", "duration_ms")
+        val header = "\""+headers.mkString("\",\"")+"\"\n"
+        headerRow = false
+        Some(header.getBytes("UTF-8"))
+      } else {
+        scala.concurrent.blocking {
+          if (keyiter.hasNext) {
+            val userid = keyiter.next
+
+            // Get pretty user info
+            val (username, email) = users.get(userid) match {
+              case Some(u) => (u.fullName, u.email.getOrElse(""))
+              case None => ("", "")
+            }
+
+            val content = new StringBuilder
+            jobLookup(userid).values.foreach { case (closedJobs, ongoingJob) =>
+              // Closed jobs first, then the job that was never "closed" (most common case)
+              (closedJobs ++ ongoingJob.toList).foreach(job => {
+                val duration = job.end.getTime - job.start.getTime
+                if (duration > 0)
+                  content.append(csvRow(userid, username, email, job, duration))
+              })
+            }
+            Some(content.toString.getBytes("UTF-8"))
+          }
+          else None
+        }
+      }
+      Future(chunk)
+    })
+
+    Ok.chunked(csvChunks.andThen(Enumerator.eof)).withHeaders(
+      "Content-Type" -> "text/csv",
+      "Content-Disposition" -> "attachment; filename=ExtractorMetrics.csv"
+    )
+  }
 }
diff --git a/app/models/Extraction.scala b/app/models/Extraction.scala
@@ -23,6 +23,20 @@ case class Extraction(
   user_id: UUID = User.anonymous.id
 )
 
+// Used in extraction report aggregation: one run of extraction events merged for a single target/extractor pair
+case class ExtractionJob(
+  target: String,       // stringified event.file_id (may in practice be a dataset or metadata id)
+  targetType: String,   // "file" or "dataset", as resolved by the report when the job is created
+  extractor: String,    // extractor id taken from the event (e.g. ncsa.file.digest)
+  spaces: String,       // comma-separated space ids of the containing dataset; empty if none was found
+  jobId: String,        // stringified event.job_id, or "" when the event has none
+  jobType: String,      // "queue" or "work", see Reporting.determineJobType()
+  statusCount: Int,     // number of events merged into this job (starts at 1)
+  lastStatus: String,   // most recent event.status recorded for the job
+  start: Date,          // event.start of the first event in this job
+  end: Date             // event.start of the latest event in this job (event.end is often blank)
+)
+
 /**
  * Currently running extractor name
  */
diff --git a/app/services/ExtractionService.scala b/app/services/ExtractionService.scala
@@ -18,6 +18,8 @@ trait ExtractionService {
 
   def get(msgId: UUID): Option[Extraction]
 
+  def getIterator(userRequired: Boolean, since: Option[String], until: Option[String], user: Option[UUID]): Iterator[Extraction]
+
   def findById(resource: ResourceRef): List[Extraction]
 
   def findByExtractorIDBefore(extractorID: String, status: String, date: String, limit: Int): List[Extraction]
diff --git a/app/services/mongodb/MongoDBExtractionService.scala b/app/services/mongodb/MongoDBExtractionService.scala
@@ -3,17 +3,19 @@ package services.mongodb
 import java.text.SimpleDateFormat
 
 import services.ExtractionService
-import models.{UUID, Extraction, ExtractionGroup, ResourceRef}
+import models.{Extraction, ExtractionGroup, ResourceRef, UUID}
 import org.bson.types.ObjectId
 import play.api.Play.current
 import com.novus.salat.dao.ModelCompanion
 import com.novus.salat.dao.SalatDAO
 import MongoContext.context
 import com.mongodb.casbah.commons.MongoDBObject
 import java.util.Date
+
 import play.api.Logger
 import models.WebPageResource
 import com.mongodb.casbah.Imports._
+import util.Parsers
 
 /**
  * Use MongoDB to store extractions
@@ -37,6 +39,15 @@ class MongoDBExtractionService extends ExtractionService {
     Extraction.findOne(MongoDBObject("id" -> new ObjectId(msgId.stringify)))
   }
 
+  /**
+   * Returns an iterator over the extraction events matching the given optional filters.
+   *
+   * @param userRequired when true, only events that have a user_id field are returned
+   * @param since optional timestamp (parsed with Parsers.fromISO8601); keeps events with start >= since
+   * @param until optional timestamp (parsed with Parsers.fromISO8601); keeps events with start <= until
+   * @param user optional user id; keeps only events whose user_id equals it
+   * @return iterator over the matching extraction events
+   */
+  def getIterator(userRequired: Boolean, since: Option[String], until: Option[String], user: Option[UUID]): Iterator[Extraction] = {
+    // Both bounds must live in ONE sub-document under "start". Merging two separate
+    // {"start": ...} objects with ++ makes the second replace the first, dropping `since`.
+    val startBounds = List(
+      since.map(t => "$gte" -> Parsers.fromISO8601(t)),
+      until.map(t => "$lte" -> Parsers.fromISO8601(t))
+    ).flatten
+
+    var query = MongoDBObject()
+    if (userRequired) query = query ++ ("user_id" $exists true)
+    if (startBounds.nonEmpty) query = query ++ ("start" -> MongoDBObject(startBounds: _*))
+    // An explicit user replaces the $exists clause on user_id, which it implies anyway
+    user.foreach(uid => query = query ++ ("user_id" -> new ObjectId(uid.stringify)))
+    Extraction.find(query).toIterator
+  }
+
   def findById(resource: ResourceRef): List[Extraction] = {
     Extraction.find(MongoDBObject("file_id" -> new ObjectId(resource.id.stringify))).toList
   }
diff --git a/app/services/mongodb/MongoSalatPlugin.scala b/app/services/mongodb/MongoSalatPlugin.scala
@@ -26,6 +26,7 @@ import services.filesystem.DiskByteStorageService
 import services.{AppConfigurationService, ByteStorageService, DI, MetadataService}
 
 import scala.collection.JavaConverters._
+import scala.collection.mutable.{Map => MutaMap}
 
 /**
  * Mongo Salat service.
@@ -1685,5 +1686,4 @@ class MongoSalatPlugin(app: Application) extends Plugin {
     }
     print("DONE")
   }
-
 }
diff --git a/conf/routes b/conf/routes
@@ -757,6 +757,8 @@ GET            /api/reports/metrics/collections
 GET            /api/reports/metrics/spaces                                              @api.Reporting.spaceMetrics()
 GET            /api/reports/metrics/users                                               @api.Reporting.userMetrics()
 GET            /api/reports/storage/spaces/:id                                          @api.Reporting.spaceStorage(id: UUID, since: Option[String] ?= None, until: Option[String] ?= None)
+GET            /api/reports/metrics/extractors                                          @api.Reporting.extractorUsage(since: Option[String] ?= None, until: Option[String] ?= None)
+
 
 # ----------------------------------------------------------------------
 # MISC./OTHER ENDPOINTS
diff --git a/scripts/reports/SummarizeExtractionsByUser.js b/scripts/reports/SummarizeExtractionsByUser.js
diff --git a/scripts/updates/UpdateUserId.js b/scripts/updates/UpdateUserId.js

Original file line number	Diff line number	Diff line change
`@@ -26,6 +26,7 @@ import services.filesystem.DiskByteStorageService`
`26`	`26`	`import services.{AppConfigurationService, ByteStorageService, DI, MetadataService}`
`27`	`27`
`28`	`28`	`import scala.collection.JavaConverters._`
	`29`	`+import scala.collection.mutable.{Map => MutaMap}`
`29`	`30`
`30`	`31`	`/**`
`31`	`32`	`* Mongo Salat service.`
`@@ -1685,5 +1686,4 @@ class MongoSalatPlugin(app: Application) extends Plugin {`
`1685`	`1686`	`}`
`1686`	`1687`	`print("DONE")`
`1687`	`1688`	`}`
`1688`		`-`
`1689`	`1689`	`}`