Skip to content

Commit 7a0c181

Browse files
authored
Merge pull request #154 from clowder-framework/release/1.14
Release/1.14
2 parents dc10a49 + dc011d8 commit 7a0c181

17 files changed

+422
-30
lines changed

CHANGELOG.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,16 @@ All notable changes to this project will be documented in this file.
44
The format is based on [Keep a Changelog](http://keepachangelog.com/)
55
and this project adheres to [Semantic Versioning](http://semver.org/).
66

7+
## 1.14.0 - 2021-01-07
8+
9+
### Added
10+
- Added a new `/api/reports/metrics/extractors` report that summarizes extractor usage by user. Database administrators
11+
can use `scripts/updates/UpdateUserId.js` to assign user IDs to older extraction event records based on resource ownership
12+
in order to improve the accuracy of the report for older data.
13+
14+
### Changed
15+
- `api/reports/storage/spaces` endpoint now accepts an optional `space` parameter specifying a space ID, rather than requiring the space ID as part of the request.
16+
- Datasets and collections in the trash are no longer indexed for discovery in search services.
717

818
## 1.13.0 - 2020-12-02
919

app/api/Reporting.scala

Lines changed: 174 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,11 @@ import javax.inject.Inject
1111
import java.util.{Date, TimeZone}
1212

1313
import services._
14-
import models.{Collection, Dataset, File, ProjectSpace, UUID, User, UserStatus}
15-
import util.Parsers
14+
import models.{Collection, Dataset, File, ProjectSpace, UUID, User, UserStatus, ExtractionJob}
1615

17-
import scala.collection.mutable.ListBuffer
16+
import org.apache.commons.lang3.Range.between
17+
import scala.collection.mutable.{ListBuffer, Map => MutaMap}
18+
import util.Parsers
1819

1920

2021
/**
@@ -25,7 +26,8 @@ class Reporting @Inject()(selections: SelectionService,
2526
files: FileService,
2627
collections: CollectionService,
2728
spaces: SpaceService,
28-
users: UserService) extends Controller with ApiController {
29+
users: UserService,
30+
extractions: ExtractionService) extends Controller with ApiController {
2931

3032
val dateFormat = new java.text.SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'")
3133
dateFormat.setTimeZone(TimeZone.getTimeZone("GMT"))
@@ -394,9 +396,9 @@ class Reporting @Inject()(selections: SelectionService,
394396
return contents
395397
}
396398

397-
def spaceStorage(id: UUID, since: Option[String], until: Option[String]) = ServerAdminAction { implicit request =>
399+
def spaceStorage(space: Option[String], since: Option[String], until: Option[String]) = ServerAdminAction { implicit request =>
398400
// Iterate over the files of every dataset in the space
399-
val results = datasets.getIterator(Some(id), None, None) // TODO: Can't use time filters here if user intends files
401+
val results = datasets.getIterator(space, None, None) // TODO: Can't use time filters here if user intends files
400402

401403
var headerRow = true
402404
val enum = Enumerator.generateM({
@@ -495,9 +497,174 @@ class Reporting @Inject()(selections: SelectionService,
495497
Future(chunk)
496498
})
497499

500+
val filename = space match {
501+
case Some(spid) => "SpaceStorage_"+spid+".csv"
502+
case None => "SpaceStorage.csv"
503+
}
504+
Ok.chunked(enum.andThen(Enumerator.eof)).withHeaders(
505+
"Content-Type" -> "text/csv",
506+
"Content-Disposition" -> ("attachment; filename="+filename)
507+
)
508+
}
509+
510+
private def determineJobType(jobMsg: String): String = {
511+
if (jobMsg == "SUBMITTED")
512+
"queue"
513+
else
514+
"work" // TODO: Better solution?
515+
}
516+
517+
def extractorUsage(since: Option[String], until: Option[String]) = ServerAdminAction { implicit request =>
518+
Logger.debug("Generating extraction metrics report")
519+
520+
/** This mapping is used to aggregate jobs.
521+
* A job is considered some countable extraction duration. It has a jobType so
522+
* we can attempt to differentiate "time in queue" from "time being processed".
523+
*
524+
* jobLookup: [
525+
* UserID -> [
526+
* UniqueJobKey -> {
527+
* jobs: [ list of jobs identical to current_job below ]
528+
* current_job: {
529+
* target event.file_id (but can be a dataset ID or metadata ID in reality)
530+
* targetType file/dataset/metadata
531+
* extractor extractor id (e.g. ncsa.file.digest)
532+
* spaceId id of space containing target
533+
* jobId official job_id, if available
534+
* jobType is this a queue event or an actual work event on a node? see determineJobType()
535+
* lastStatus most recent event.status for the job
536+
* start earliest event.start time from events in this job (event.end is often blank)
537+
* end latest event.start time from events in this job (event.end is often blank)
538+
*
539+
* }
540+
* }
541+
*/
542+
val jobLookup: MutaMap[UUID,
543+
MutaMap[String, (List[ExtractionJob], Option[ExtractionJob])]] = MutaMap.empty
544+
545+
val results = extractions.getIterator(true, since, until, None)
546+
while (results.hasNext) {
547+
val event = results.next
548+
549+
// Collect info to associate this event with a job if possible
550+
val jobId = event.job_id match {
551+
case Some(jid) => jid.stringify
552+
case None => ""
553+
}
554+
val jobType = determineJobType(event.status)
555+
val uniqueKey = event.file_id + " - " + event.extractor_id
556+
557+
// Add user and uniqueKey if they don't exist yet
558+
if (!jobLookup.get(event.user_id).isDefined)
559+
jobLookup(event.user_id) = MutaMap.empty
560+
if (!jobLookup.get(event.user_id).get.get(uniqueKey).isDefined)
561+
jobLookup(event.user_id)(uniqueKey) = (List.empty, None)
562+
563+
// If we don't have an ongoing job, or it's not same jobType, start a new ongoing job
564+
var jobList = jobLookup(event.user_id)(uniqueKey)._1
565+
val currentJob = jobLookup(event.user_id)(uniqueKey)._2
566+
val newJobBeginning = currentJob match {
567+
case Some(currJob) => currJob.jobType != jobType
568+
case None => true
569+
}
570+
571+
if (newJobBeginning) {
572+
// Determine parent details for new job - quick dataset check first, then file search
573+
var spaces = ""
574+
var resourceType = "file"
575+
val parentDatasets = datasets.findByFileIdAllContain(event.file_id)
576+
if (parentDatasets.length > 0) {
577+
parentDatasets.foreach(ds => {
578+
spaces = ds.spaces.mkString(",")
579+
resourceType = "file"
580+
})
581+
} else {
582+
datasets.get(event.file_id) match {
583+
case Some(ds) => {
584+
spaces = ds.spaces.mkString(",")
585+
resourceType = "dataset"
586+
}
587+
case None => {}
588+
}
589+
}
590+
591+
// Push current job to jobs list (saying it ended at start of next stage) and make new job entry
592+
if (currentJob.isDefined) {
593+
jobList = jobList ::: List(currentJob.get.copy(end=event.start))
594+
}
595+
val newJob = ExtractionJob(event.file_id.stringify, resourceType, event.extractor_id, spaces, jobId, jobType, 1,
596+
event.status, event.start, event.start)
597+
jobLookup(event.user_id)(uniqueKey) = (jobList, Some(newJob))
598+
} else {
599+
// Don't overwrite DONE as final message in case we have small differences in timing of last extractor msg
600+
var status = currentJob.get.lastStatus
601+
if (status != "DONE") status = event.status
602+
val updatedJob = currentJob.get.copy(statusCount=currentJob.get.statusCount+1, lastStatus=event.status, end=event.start)
603+
jobLookup(event.user_id)(uniqueKey) = (jobList, Some(updatedJob))
604+
}
605+
}
606+
607+
var headerRow = true
608+
val keyiter = jobLookup.keysIterator
609+
val enum = Enumerator.generateM({
610+
val chunk = if (headerRow) {
611+
val headers = List("userid", "username", "email", "resource_id", "resource_type", "space_id", "extractor",
612+
"job_id", "job_type", "status_count", "last_status", "start", "end", "duration_ms")
613+
val header = "\""+headers.mkString("\",\"")+"\"\n"
614+
headerRow = false
615+
Some(header.getBytes("UTF-8"))
616+
} else {
617+
scala.concurrent.blocking {
618+
if (keyiter.hasNext) {
619+
val userid = keyiter.next
620+
621+
// Get pretty user info
622+
var username = ""
623+
var email = ""
624+
users.get(userid) match {
625+
case Some(u) => {
626+
username = u.fullName
627+
email = u.email.getOrElse("")
628+
}
629+
case None => {}
630+
}
631+
632+
var content = ""
633+
val userRecords = jobLookup(userid)
634+
userRecords.keysIterator.foreach(jobkey => {
635+
val jobHistory = userRecords(jobkey)
636+
val jobList = jobHistory._1
637+
val currJob = jobHistory._2
638+
jobList.foreach(job => {
639+
val duration = (job.end.getTime - job.start.getTime)
640+
val row = List(userid.stringify, username, email, job.target, job.targetType, job.spaces, job.extractor,
641+
job.jobId, job.jobType, job.statusCount, job.lastStatus, job.start, job.end, duration)
642+
if (duration > 0)
643+
content += "\""+row.mkString("\",\"")+"\"\n"
644+
})
645+
// current job if it was never "closed" and pushed to the jobList (most common case)
646+
currJob match {
647+
case Some(job) => {
648+
val duration = (job.end.getTime - job.start.getTime)
649+
val row = List(userid.stringify, username, email, job.target, job.targetType, job.spaces, job.extractor,
650+
job.jobId, job.jobType, job.statusCount, job.lastStatus, job.start, job.end, duration)
651+
if (duration > 0)
652+
content += "\""+row.mkString("\",\"")+"\"\n"
653+
}
654+
case None => {}
655+
}
656+
})
657+
Some(content.getBytes("UTF-8"))
658+
}
659+
else None
660+
}
661+
}
662+
Future(chunk)
663+
})
664+
498665
Ok.chunked(enum.andThen(Enumerator.eof)).withHeaders(
499666
"Content-Type" -> "text/csv",
500-
"Content-Disposition" -> ("attachment; filename=SpaceStorage"+id.stringify+".csv")
667+
"Content-Disposition" -> "attachment; filename=ExtractorMetrics.csv"
501668
)
502669
}
503670
}

app/models/Extraction.scala

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,20 @@ case class Extraction(
2323
user_id: UUID = User.anonymous.id
2424
)
2525

26+
// Used in extraction report aggregation
27+
case class ExtractionJob(
28+
target: String,
29+
targetType: String,
30+
extractor: String,
31+
spaces: String,
32+
jobId: String,
33+
jobType: String,
34+
statusCount: Int,
35+
lastStatus: String,
36+
start: Date,
37+
end: Date
38+
)
39+
2640
/**
2741
* Currently running extractor name
2842
*/

app/services/DatasetService.scala

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -383,6 +383,7 @@ trait DatasetService {
383383

384384
def incrementDownloads(id: UUID, user: Option[User])
385385

386-
def getIterator(space: Option[UUID], since: Option[String], until: Option[String]): Iterator[Dataset]
386+
def getIterator(space: Option[String], since: Option[String], until: Option[String]): Iterator[Dataset]
387387

388+
def getTrashedIds(): List[UUID]
388389
}

app/services/ExtractionService.scala

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@ trait ExtractionService {
1818

1919
def get(msgId: UUID): Option[Extraction]
2020

21+
def getIterator(userRequired: Boolean, since: Option[String], until: Option[String], user: Option[UUID]): Iterator[Extraction]
22+
2123
def findById(resource: ResourceRef): List[Extraction]
2224

2325
def findByExtractorIDBefore(extractorID: String, status: String, date: String, limit: Int): List[Extraction]

app/services/FileService.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -246,6 +246,6 @@ trait FileService {
246246

247247
def incrementDownloads(id: UUID, user: Option[User])
248248

249-
def getIterator(space: Option[UUID], since: Option[String], until: Option[String]): Iterator[File]
249+
def getIterator(space: Option[String], since: Option[String], until: Option[String]): Iterator[File]
250250

251251
}

app/services/mongodb/MongoDBCollectionService.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -894,7 +894,7 @@ class MongoDBCollectionService @Inject() (
894894

895895
def indexAll(idx: Option[String] = None) = {
896896
// Bypass Salat in case any of the file records are malformed to continue past them
897-
Collection.dao.collection.find(MongoDBObject(), MongoDBObject("_id" -> 1)).foreach(c => {
897+
Collection.dao.collection.find(MongoDBObject("trash" -> false), MongoDBObject("_id" -> 1)).foreach(c => {
898898
index(new UUID(c.get("_id").toString), idx)
899899
})
900900
}

app/services/mongodb/MongoDBDatasetService.scala

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1412,7 +1412,7 @@ class MongoDBDatasetService @Inject() (
14121412

14131413
def indexAll(idx: Option[String] = None) = {
14141414
// Bypass Salat in case any of the file records are malformed to continue past them
1415-
Dataset.dao.collection.find(MongoDBObject(), MongoDBObject("_id" -> 1)).foreach(d => {
1415+
Dataset.dao.collection.find(MongoDBObject("trash" -> false), MongoDBObject("_id" -> 1)).foreach(d => {
14161416
index(new UUID(d.get("_id").toString), idx)
14171417
})
14181418
}
@@ -1641,13 +1641,23 @@ class MongoDBDatasetService @Inject() (
16411641
* @param since - include only datasets created after a certain date
16421642
* @param until - include only datasets created before a certain date
16431643
*/
1644-
def getIterator(space: Option[UUID], since: Option[String], until: Option[String]): Iterator[Dataset] = {
1644+
def getIterator(space: Option[String], since: Option[String], until: Option[String]): Iterator[Dataset] = {
16451645
var query = MongoDBObject("trash" -> false)
1646-
space.foreach(spid => query += ("spaces" -> new ObjectId(spid.stringify)))
1646+
space.foreach(spid => query += ("spaces" -> new ObjectId(spid)))
16471647
since.foreach(t => query = query ++ ("created" $gte Parsers.fromISO8601(t)))
16481648
until.foreach(t => query = query ++ ("created" $lte Parsers.fromISO8601(t)))
16491649
Dataset.find(query)
16501650
}
1651+
1652+
// Get a list of all trashed dataset and file ids for comparison
1653+
def getTrashedIds(): List[UUID] = {
1654+
val trashedIds = ListBuffer[UUID]()
1655+
Dataset.find(MongoDBObject("trash" -> true)).map(ds => {
1656+
ds.files.foreach(fid => trashedIds += fid)
1657+
trashedIds += ds.id
1658+
})
1659+
trashedIds.toList
1660+
}
16511661
}
16521662

16531663
object Dataset extends ModelCompanion[Dataset, ObjectId] {

app/services/mongodb/MongoDBExtractionService.scala

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,17 +3,19 @@ package services.mongodb
33
import java.text.SimpleDateFormat
44

55
import services.ExtractionService
6-
import models.{UUID, Extraction, ExtractionGroup, ResourceRef}
6+
import models.{Extraction, ExtractionGroup, ResourceRef, UUID}
77
import org.bson.types.ObjectId
88
import play.api.Play.current
99
import com.novus.salat.dao.ModelCompanion
1010
import com.novus.salat.dao.SalatDAO
1111
import MongoContext.context
1212
import com.mongodb.casbah.commons.MongoDBObject
1313
import java.util.Date
14+
1415
import play.api.Logger
1516
import models.WebPageResource
1617
import com.mongodb.casbah.Imports._
18+
import util.Parsers
1719

1820
/**
1921
* Use MongoDB to store extractions
@@ -37,6 +39,15 @@ class MongoDBExtractionService extends ExtractionService {
3739
Extraction.findOne(MongoDBObject("id" -> new ObjectId(msgId.stringify)))
3840
}
3941

42+
def getIterator(userRequired: Boolean, since: Option[String], until: Option[String], user: Option[UUID]): Iterator[Extraction] = {
43+
var query = MongoDBObject()
44+
if (userRequired) query = query ++ ("user_id" $exists true)
45+
since.foreach(t => query = query ++ ("start" $gte Parsers.fromISO8601(t)))
46+
until.foreach(t => query = query ++ ("start" $lte Parsers.fromISO8601(t)))
47+
user.foreach(uid => query = query ++ ("user_id" -> new ObjectId(uid.stringify)))
48+
Extraction.find(query).toIterator
49+
}
50+
4051
def findById(resource: ResourceRef): List[Extraction] = {
4152
Extraction.find(MongoDBObject("file_id" -> new ObjectId(resource.id.stringify))).toList
4253
}

app/services/mongodb/MongoDBFileService.scala

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -326,8 +326,11 @@ class MongoDBFileService @Inject() (
326326

327327
def indexAll(idx: Option[String] = None) = {
328328
// Bypass Salat in case any of the file records are malformed to continue past them
329+
val trashedIds = datasets.getTrashedIds()
329330
FileDAO.dao.collection.find(MongoDBObject(), MongoDBObject("_id" -> 1)).foreach(f => {
330-
index(new UUID(f.get("_id").toString), idx)
331+
val fid = new UUID(f.get("_id").toString)
332+
if (!trashedIds.contains(fid))
333+
index(fid, idx)
331334
})
332335
}
333336

@@ -1215,7 +1218,7 @@ class MongoDBFileService @Inject() (
12151218
}
12161219
}
12171220

1218-
def getIterator(space: Option[UUID], since: Option[String], until: Option[String]): Iterator[File] = {
1221+
def getIterator(space: Option[String], since: Option[String], until: Option[String]): Iterator[File] = {
12191222
var query = MongoDBObject()
12201223
space.foreach(spid => {
12211224
// If space is specified, we have to get that association from datasets for now

0 commit comments

Comments
 (0)