Skip to content

Commit 4f92b61

Browse files
authored
initial implementation of extractor usage report (#124)
* initial implementation of report * Update CHANGELOG.md * stub for db update script * Update MongoSalatPlugin.scala * simplify user lookup based on db update * Updates for parity with script * logic improvements * Clean up output formatting * Add duration to queue jobs * User ID script fix dataset IDs, print counts in script
1 parent dc10a49 commit 4f92b61

File tree

9 files changed

+377
-8
lines changed

9 files changed

+377
-8
lines changed

CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,10 @@ All notable changes to this project will be documented in this file.
44
The format is based on [Keep a Changelog](http://keepachangelog.com/)
55
and this project adheres to [Semantic Versioning](http://semver.org/).
66

7+
## Unreleased
8+
9+
### Added
10+
- Added a new `/api/reports/metrics/extractors` report for summarizing extractor usage by user.
711

812
## 1.13.0 - 2020-12-02
913

@@ -33,6 +37,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/).
3337
### Changed
3438
- Docker Images are now pushed to [github container registry](https://github.com/orgs/clowder-framework/packages)
3539

40+
3641
## 1.12.0 - 2020-10-19
3742
**_Warning:_**
3843
- This update modifies the MongoDB schema. Make sure to start the application with `-DMONGOUPDATE=1`.

app/api/Reporting.scala

Lines changed: 167 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,11 @@ import javax.inject.Inject
1111
import java.util.{Date, TimeZone}
1212

1313
import services._
14-
import models.{Collection, Dataset, File, ProjectSpace, UUID, User, UserStatus}
15-
import util.Parsers
14+
import models.{Collection, Dataset, File, ProjectSpace, UUID, User, UserStatus, ExtractionJob}
1615

17-
import scala.collection.mutable.ListBuffer
16+
import org.apache.commons.lang3.Range.between
17+
import scala.collection.mutable.{ListBuffer, Map => MutaMap}
18+
import util.Parsers
1819

1920

2021
/**
@@ -25,7 +26,8 @@ class Reporting @Inject()(selections: SelectionService,
2526
files: FileService,
2627
collections: CollectionService,
2728
spaces: SpaceService,
28-
users: UserService) extends Controller with ApiController {
29+
users: UserService,
30+
extractions: ExtractionService) extends Controller with ApiController {
2931

3032
val dateFormat = new java.text.SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'")
3133
dateFormat.setTimeZone(TimeZone.getTimeZone("GMT"))
@@ -500,4 +502,165 @@ class Reporting @Inject()(selections: SelectionService,
500502
"Content-Disposition" -> ("attachment; filename=SpaceStorage"+id.stringify+".csv")
501503
)
502504
}
505+
506+
/**
 * Classify an extraction status message as a job type.
 *
 * @param jobMsg status string of an extraction event
 * @return "queue" when the status is exactly "SUBMITTED", otherwise "work"
 */
private def determineJobType(jobMsg: String): String = jobMsg match {
  case "SUBMITTED" => "queue"
  case _ => "work" // TODO: Better solution?
}
512+
513+
/**
 * Stream a CSV report summarizing extractor usage per user.
 *
 * Extraction events are read from `extractions.getIterator` (only events that carry a user, limited to the
 * optional since/until window) and aggregated into "jobs": consecutive events for the same user, target and
 * extractor that share a job type (see determineJobType). The response contains a header row followed by one
 * chunk per user; jobs whose computed duration is not positive are omitted.
 *
 * @param since optional lower time bound, passed through to `extractions.getIterator`
 * @param until optional upper time bound, passed through to `extractions.getIterator`
 * @return chunked `text/csv` response served as the attachment `ExtractorMetrics.csv`
 */
def extractorUsage(since: Option[String], until: Option[String]) = ServerAdminAction { implicit request =>
  Logger.debug("Generating extraction metrics report")

  /** This mapping is used to aggregate jobs.
   * A job is considered some countable extraction duration. It has a jobType so
   * we can attempt to differentiate "time in queue" from "time being processed".
   *
   * jobLookup: [
   *   UserID -> [
   *     UniqueJobKey -> (
   *       finished jobs: list of jobs shaped like the current job below,
   *       current job (if any): {
   *         target       event.file_id (but can be a dataset ID or metadata ID in reality)
   *         targetType   file/dataset/metadata
   *         extractor    extractor id (e.g. ncsa.file.digest)
   *         spaces       comma-separated ids of the spaces containing the target
   *         jobId        official job_id, if available
   *         jobType      is this a queue event or an actual work event on a node? see determineJobType()
   *         statusCount  number of events folded into this job
   *         lastStatus   most recent event.status for the job
   *         start        earliest event.start time from events in this job (event.end is often blank)
   *         end          latest event.start time from events in this job (event.end is often blank)
   *       }
   *     )
   *   ]
   * ]
   */
  val jobLookup: MutaMap[UUID,
    MutaMap[String, (List[ExtractionJob], Option[ExtractionJob])]] = MutaMap.empty

  extractions.getIterator(userRequired = true, since, until, None).foreach { event =>
    // Collect info to associate this event with a job if possible
    val jobId = event.job_id.map(_.stringify).getOrElse("")
    val jobType = determineJobType(event.status)
    val uniqueKey = event.file_id + " - " + event.extractor_id

    // Add user and uniqueKey entries on first sight
    val userJobs = jobLookup.getOrElseUpdate(event.user_id, MutaMap.empty)
    val (finishedJobs, currentJob) =
      userJobs.getOrElse(uniqueKey, (List.empty[ExtractionJob], Option.empty[ExtractionJob]))

    currentJob match {
      case Some(ongoing) if ongoing.jobType == jobType =>
        // Same stage of the same job: extend it.
        // Don't overwrite DONE as final message in case we have small differences in timing of last extractor msg
        val status = if (ongoing.lastStatus == "DONE") ongoing.lastStatus else event.status
        val updatedJob = ongoing.copy(statusCount = ongoing.statusCount + 1, lastStatus = status, end = event.start)
        userJobs(uniqueKey) = (finishedJobs, Some(updatedJob))

      case _ =>
        // No ongoing job, or the job type changed: start a new ongoing job.
        // Determine parent details for new job - quick dataset check first, then file search
        var spaceIds = ""
        var resourceType = "file"
        val parentDatasets = datasets.findByFileIdAllContain(event.file_id)
        if (parentDatasets.length > 0) {
          // NOTE(review): when several datasets contain the file, only the spaces of the last one are kept
          parentDatasets.foreach(ds => spaceIds = ds.spaces.mkString(","))
        } else {
          datasets.get(event.file_id) match {
            case Some(ds) =>
              spaceIds = ds.spaces.mkString(",")
              resourceType = "dataset"
            case None =>
          }
        }

        // Push current job to jobs list (saying it ended at start of next stage) and make new job entry
        val closedJobs = finishedJobs ::: currentJob.map(_.copy(end = event.start)).toList
        val newJob = ExtractionJob(
          target = event.file_id.stringify,
          targetType = resourceType,
          extractor = event.extractor_id,
          spaces = spaceIds,
          jobId = jobId,
          jobType = jobType,
          statusCount = 1,
          lastStatus = event.status,
          start = event.start,
          end = event.start)
        userJobs(uniqueKey) = (closedJobs, Some(newJob))
    }
  }

  // Render one CSV line: every field quoted, embedded double quotes doubled so the line stays parseable.
  def csvRow(fields: Seq[Any]): String =
    fields.map(f => String.valueOf(f).replace("\"", "\"\"")).mkString("\"", "\",\"", "\"\n")

  val headers = List("userid", "username", "email", "resource_id", "resource_type", "space_id", "extractor",
    "job_id", "job_type", "status_count", "last_status", "start", "end", "duration_ms")

  var headerRow = true
  val keyiter = jobLookup.keysIterator
  val enum = Enumerator.generateM({
    val chunk: Option[Array[Byte]] = if (headerRow) {
      headerRow = false
      Some(csvRow(headers).getBytes("UTF-8"))
    } else {
      scala.concurrent.blocking {
        if (keyiter.hasNext) {
          val userid = keyiter.next

          // Get pretty user info
          val (username, email) = users.get(userid) match {
            case Some(u) => (u.fullName, u.email.getOrElse(""))
            case None => ("", "")
          }

          val content = new StringBuilder
          jobLookup(userid).foreach { case (_, (finishedJobs, openJob)) =>
            // Finished jobs first, then the current job if it was never "closed" (most common case)
            (finishedJobs ::: openJob.toList).foreach { job =>
              val duration = job.end.getTime - job.start.getTime
              if (duration > 0) {
                content ++= csvRow(List(userid.stringify, username, email, job.target, job.targetType, job.spaces,
                  job.extractor, job.jobId, job.jobType, job.statusCount, job.lastStatus, job.start, job.end,
                  duration))
              }
            }
          }
          Some(content.toString.getBytes("UTF-8"))
        }
        else None
      }
    }
    Future(chunk)
  })

  Ok.chunked(enum.andThen(Enumerator.eof)).withHeaders(
    "Content-Type" -> "text/csv",
    "Content-Disposition" -> "attachment; filename=ExtractorMetrics.csv"
  )
}
503666
}

app/models/Extraction.scala

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,20 @@ case class Extraction(
2323
user_id: UUID = User.anonymous.id
2424
)
2525

26+
/**
 * Aggregated view of consecutive extraction events, used in extraction report aggregation.
 *
 * @param target      id of the resource the extraction ran against
 * @param targetType  kind of resource the target id refers to (e.g. "file" or "dataset")
 * @param extractor   extractor id
 * @param spaces      comma-separated ids of the spaces associated with the target
 * @param jobId       job id as a string; empty when the event carried none
 * @param jobType     job type label, e.g. "queue" or "work"
 * @param statusCount number of events folded into this job
 * @param lastStatus  most recent status recorded for this job
 * @param start       start time of the first event in this job
 * @param end         time at which this job is considered to have ended
 */
case class ExtractionJob(
  target: String,
  targetType: String,
  extractor: String,
  spaces: String,
  jobId: String,
  jobType: String,
  statusCount: Int,
  lastStatus: String,
  start: Date,
  end: Date
)
39+
2640
/**
2741
* Currently running extractor name
2842
*/

app/services/ExtractionService.scala

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@ trait ExtractionService {
1818

1919
def get(msgId: UUID): Option[Extraction]
2020

21+
/**
 * Iterate over stored extraction events, optionally filtered.
 *
 * @param userRequired when true, only events that have a user_id are returned
 * @param since if given, only events whose start time is at or after this timestamp are returned
 *              (presumably an ISO 8601 string; verify against the implementation's parser)
 * @param until if given, only events whose start time is at or before this timestamp are returned
 *              (same format as `since`)
 * @param user if given, only events belonging to this user are returned
 * @return an iterator over the matching extraction events
 */
def getIterator(userRequired: Boolean, since: Option[String], until: Option[String], user: Option[UUID]): Iterator[Extraction]
22+
2123
def findById(resource: ResourceRef): List[Extraction]
2224

2325
def findByExtractorIDBefore(extractorID: String, status: String, date: String, limit: Int): List[Extraction]

app/services/mongodb/MongoDBExtractionService.scala

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,17 +3,19 @@ package services.mongodb
33
import java.text.SimpleDateFormat
44

55
import services.ExtractionService
6-
import models.{UUID, Extraction, ExtractionGroup, ResourceRef}
6+
import models.{Extraction, ExtractionGroup, ResourceRef, UUID}
77
import org.bson.types.ObjectId
88
import play.api.Play.current
99
import com.novus.salat.dao.ModelCompanion
1010
import com.novus.salat.dao.SalatDAO
1111
import MongoContext.context
1212
import com.mongodb.casbah.commons.MongoDBObject
1313
import java.util.Date
14+
1415
import play.api.Logger
1516
import models.WebPageResource
1617
import com.mongodb.casbah.Imports._
18+
import util.Parsers
1719

1820
/**
1921
* Use MongoDB to store extractions
@@ -37,6 +39,15 @@ class MongoDBExtractionService extends ExtractionService {
3739
Extraction.findOne(MongoDBObject("id" -> new ObjectId(msgId.stringify)))
3840
}
3941

42+
/**
 * Iterate over extraction events, optionally filtered by user and by start time.
 *
 * @param userRequired when true (and no specific user is given), only events that have a user_id field
 * @param since if given, only events with start >= this timestamp (parsed with Parsers.fromISO8601)
 * @param until if given, only events with start <= this timestamp (parsed with Parsers.fromISO8601)
 * @param user if given, only events whose user_id equals this id; this replaces the userRequired check
 * @return iterator over the matching Extraction documents
 */
def getIterator(userRequired: Boolean, since: Option[String], until: Option[String], user: Option[UUID]): Iterator[Extraction] = {
  // An exact user match implies the field exists, so it supersedes the plain existence check.
  val userFilter: DBObject = user match {
    case Some(uid) => MongoDBObject("user_id" -> new ObjectId(uid.stringify))
    case None if userRequired => "user_id" $exists true
    case None => MongoDBObject()
  }

  // Both bounds apply to the same "start" key, so they must live in ONE condition document:
  // merging two separate {"start": ...} objects with ++ would let the later one replace the earlier.
  val from = since.map(t => Parsers.fromISO8601(t))
  val to = until.map(t => Parsers.fromISO8601(t))
  val startFilter: DBObject = (from, to) match {
    case (Some(lower), Some(upper)) => "start" $gte lower $lte upper
    case (Some(lower), None) => "start" $gte lower
    case (None, Some(upper)) => "start" $lte upper
    case (None, None) => MongoDBObject()
  }

  Extraction.find(userFilter ++ startFilter).toIterator
}
50+
4051
def findById(resource: ResourceRef): List[Extraction] = {
4152
Extraction.find(MongoDBObject("file_id" -> new ObjectId(resource.id.stringify))).toList
4253
}

app/services/mongodb/MongoSalatPlugin.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ import services.filesystem.DiskByteStorageService
2626
import services.{AppConfigurationService, ByteStorageService, DI, MetadataService}
2727

2828
import scala.collection.JavaConverters._
29+
import scala.collection.mutable.{Map => MutaMap}
2930

3031
/**
3132
* Mongo Salat service.
@@ -1685,5 +1686,4 @@ class MongoSalatPlugin(app: Application) extends Plugin {
16851686
}
16861687
print("DONE")
16871688
}
1688-
16891689
}

conf/routes

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -757,6 +757,8 @@ GET /api/reports/metrics/collections
757757
GET /api/reports/metrics/spaces @api.Reporting.spaceMetrics()
758758
GET /api/reports/metrics/users @api.Reporting.userMetrics()
759759
GET /api/reports/storage/spaces/:id @api.Reporting.spaceStorage(id: UUID, since: Option[String] ?= None, until: Option[String] ?= None)
760+
GET /api/reports/metrics/extractors @api.Reporting.extractorUsage(since: Option[String] ?= None, until: Option[String] ?= None)
761+
760762

761763
# ----------------------------------------------------------------------
762764
# MISC./OTHER ENDPOINTS

0 commit comments

Comments
 (0)