Skip to content

Commit 7a0c181

Browse files
authored
Merge pull request #154 from clowder-framework/release/1.14
Release/1.14
2 parents dc10a49 + dc011d8 commit 7a0c181

17 files changed

+422
-30
lines changed

CHANGELOG.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,16 @@ All notable changes to this project will be documented in this file.
44
The format is based on [Keep a Changelog](http://keepachangelog.com/)
55
and this project adheres to [Semantic Versioning](http://semver.org/).
66

7+
## 1.14.0 - 2021-01-07
8+
9+
### Added
10+
- Added a new `/api/reports/metrics/extractors` report that summarizes extractor usage by user. Database administrators
11+
can use `scripts/updates/UpdateUserId.js` to assign user IDs to older extraction event records based on resource ownership
12+
in order to improve the accuracy of the report for older data.
13+
14+
### Changed
15+
- `api/reports/storage/spaces` endpoint now accepts an optional `space` parameter specifying a space ID, rather than requiring the space ID as part of the request.
16+
- Datasets and collections in the trash are no longer indexed for discovery in search services.
717

818
## 1.13.0 - 2020-12-02
919

app/api/Reporting.scala

Lines changed: 174 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,11 @@ import javax.inject.Inject
1111
import java.util.{Date, TimeZone}
1212

1313
import services._
14-
import models.{Collection, Dataset, File, ProjectSpace, UUID, User, UserStatus}
15-
import util.Parsers
14+
import models.{Collection, Dataset, File, ProjectSpace, UUID, User, UserStatus, ExtractionJob}
1615

17-
import scala.collection.mutable.ListBuffer
16+
import org.apache.commons.lang3.Range.between
17+
import scala.collection.mutable.{ListBuffer, Map => MutaMap}
18+
import util.Parsers
1819

1920

2021
/**
@@ -25,7 +26,8 @@ class Reporting @Inject()(selections: SelectionService,
2526
files: FileService,
2627
collections: CollectionService,
2728
spaces: SpaceService,
28-
users: UserService) extends Controller with ApiController {
29+
users: UserService,
30+
extractions: ExtractionService) extends Controller with ApiController {
2931

3032
val dateFormat = new java.text.SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'")
3133
dateFormat.setTimeZone(TimeZone.getTimeZone("GMT"))
@@ -394,9 +396,9 @@ class Reporting @Inject()(selections: SelectionService,
394396
return contents
395397
}
396398

397-
def spaceStorage(id: UUID, since: Option[String], until: Option[String]) = ServerAdminAction { implicit request =>
399+
def spaceStorage(space: Option[String], since: Option[String], until: Option[String]) = ServerAdminAction { implicit request =>
398400
// Iterate over the files of every dataset in the space
399-
val results = datasets.getIterator(Some(id), None, None) // TODO: Can't use time filters here if user intends files
401+
val results = datasets.getIterator(space, None, None) // TODO: Can't use time filters here if user intends files
400402

401403
var headerRow = true
402404
val enum = Enumerator.generateM({
@@ -495,9 +497,174 @@ class Reporting @Inject()(selections: SelectionService,
495497
Future(chunk)
496498
})
497499

500+
val filename = space match {
501+
case Some(spid) => "SpaceStorage_"+spid+".csv"
502+
case None => "SpaceStorage.csv"
503+
}
504+
Ok.chunked(enum.andThen(Enumerator.eof)).withHeaders(
505+
"Content-Type" -> "text/csv",
506+
"Content-Disposition" -> ("attachment; filename="+filename)
507+
)
508+
}
509+
510+
private def determineJobType(jobMsg: String): String = {
511+
if (jobMsg == "SUBMITTED")
512+
"queue"
513+
else
514+
"work" // TODO: Better solution?
515+
}
516+
517+
def extractorUsage(since: Option[String], until: Option[String]) = ServerAdminAction { implicit request =>
518+
Logger.debug("Generating extraction metrics report")
519+
520+
/** This mapping is used to aggregate jobs.
521+
* A job is considered some countable extraction duration. It has a jobType so
522+
* we can attempt to differentiate "time in queue" from "time being processed".
523+
*
524+
* jobLookup: [
525+
* UserID -> [
526+
* UniqueJobKey -> {
527+
* jobs: [ list of jobs identical to current_job below ]
528+
* current_job: {
529+
* target event.file_id (but can be a dataset ID or metadata ID in reality)
530+
* targetType file/dataset/metadata
531+
* extractor extractor id (e.g. ncsa.file.digest)
532+
* spaceId id of space containing target
533+
* jobId official job_id, if available
534+
* jobType is this a queue event or an actual work event on a node? see determineJobType()
535+
* lastStatus most recent event.status for the job
536+
* start earliest event.start time from events in this job (event.end is often blank)
537+
* end latest event.start time from events in this job (event.end is often blank)
538+
*
539+
* }
540+
* }
541+
*/
542+
val jobLookup: MutaMap[UUID,
543+
MutaMap[String, (List[ExtractionJob], Option[ExtractionJob])]] = MutaMap.empty
544+
545+
val results = extractions.getIterator(true, since, until, None)
546+
while (results.hasNext) {
547+
val event = results.next
548+
549+
// Collect info to associate this event with a job if possible
550+
val jobId = event.job_id match {
551+
case Some(jid) => jid.stringify
552+
case None => ""
553+
}
554+
val jobType = determineJobType(event.status)
555+
val uniqueKey = event.file_id + " - " + event.extractor_id
556+
557+
// Add user and uniqueKey if they don't exist yet
558+
if (!jobLookup.get(event.user_id).isDefined)
559+
jobLookup(event.user_id) = MutaMap.empty
560+
if (!jobLookup.get(event.user_id).get.get(uniqueKey).isDefined)
561+
jobLookup(event.user_id)(uniqueKey) = (List.empty, None)
562+
563+
// If we don't have an ongoing job, or it's not same jobType, start a new ongoing job
564+
var jobList = jobLookup(event.user_id)(uniqueKey)._1
565+
val currentJob = jobLookup(event.user_id)(uniqueKey)._2
566+
val newJobBeginning = currentJob match {
567+
case Some(currJob) => currJob.jobType != jobType
568+
case None => true
569+
}
570+
571+
if (newJobBeginning) {
572+
// Determine parent details for new job - quick dataset check first, then file search
573+
var spaces = ""
574+
var resourceType = "file"
575+
val parentDatasets = datasets.findByFileIdAllContain(event.file_id)
576+
if (parentDatasets.length > 0) {
577+
parentDatasets.foreach(ds => {
578+
spaces = ds.spaces.mkString(",")
579+
resourceType = "file"
580+
})
581+
} else {
582+
datasets.get(event.file_id) match {
583+
case Some(ds) => {
584+
spaces = ds.spaces.mkString(",")
585+
resourceType = "dataset"
586+
}
587+
case None => {}
588+
}
589+
}
590+
591+
// Push current job to jobs list (saying it ended at start of next stage) and make new job entry
592+
if (currentJob.isDefined) {
593+
jobList = jobList ::: List(currentJob.get.copy(end=event.start))
594+
}
595+
val newJob = ExtractionJob(event.file_id.stringify, resourceType, event.extractor_id, spaces, jobId, jobType, 1,
596+
event.status, event.start, event.start)
597+
jobLookup(event.user_id)(uniqueKey) = (jobList, Some(newJob))
598+
} else {
599+
// Don't overwrite DONE as final message in case we have small differences in timing of last extractor msg
600+
var status = currentJob.get.lastStatus
601+
if (status != "DONE") status = event.status
602+
val updatedJob = currentJob.get.copy(statusCount=currentJob.get.statusCount+1, lastStatus=event.status, end=event.start)
603+
jobLookup(event.user_id)(uniqueKey) = (jobList, Some(updatedJob))
604+
}
605+
}
606+
607+
var headerRow = true
608+
val keyiter = jobLookup.keysIterator
609+
val enum = Enumerator.generateM({
610+
val chunk = if (headerRow) {
611+
val headers = List("userid", "username", "email", "resource_id", "resource_type", "space_id", "extractor",
612+
"job_id", "job_type", "status_count", "last_status", "start", "end", "duration_ms")
613+
val header = "\""+headers.mkString("\",\"")+"\"\n"
614+
headerRow = false
615+
Some(header.getBytes("UTF-8"))
616+
} else {
617+
scala.concurrent.blocking {
618+
if (keyiter.hasNext) {
619+
val userid = keyiter.next
620+
621+
// Get pretty user info
622+
var username = ""
623+
var email = ""
624+
users.get(userid) match {
625+
case Some(u) => {
626+
username = u.fullName
627+
email = u.email.getOrElse("")
628+
}
629+
case None => {}
630+
}
631+
632+
var content = ""
633+
val userRecords = jobLookup(userid)
634+
userRecords.keysIterator.foreach(jobkey => {
635+
val jobHistory = userRecords(jobkey)
636+
val jobList = jobHistory._1
637+
val currJob = jobHistory._2
638+
jobList.foreach(job => {
639+
val duration = (job.end.getTime - job.start.getTime)
640+
val row = List(userid.stringify, username, email, job.target, job.targetType, job.spaces, job.extractor,
641+
job.jobId, job.jobType, job.statusCount, job.lastStatus, job.start, job.end, duration)
642+
if (duration > 0)
643+
content += "\""+row.mkString("\",\"")+"\"\n"
644+
})
645+
// current job if it was never "closed" and pushed to the jobList (most common case)
646+
currJob match {
647+
case Some(job) => {
648+
val duration = (job.end.getTime - job.start.getTime)
649+
val row = List(userid.stringify, username, email, job.target, job.targetType, job.spaces, job.extractor,
650+
job.jobId, job.jobType, job.statusCount, job.lastStatus, job.start, job.end, duration)
651+
if (duration > 0)
652+
content += "\""+row.mkString("\",\"")+"\"\n"
653+
}
654+
case None => {}
655+
}
656+
})
657+
Some(content.getBytes("UTF-8"))
658+
}
659+
else None
660+
}
661+
}
662+
Future(chunk)
663+
})
664+
498665
Ok.chunked(enum.andThen(Enumerator.eof)).withHeaders(
499666
"Content-Type" -> "text/csv",
500-
"Content-Disposition" -> ("attachment; filename=SpaceStorage"+id.stringify+".csv")
667+
"Content-Disposition" -> "attachment; filename=ExtractorMetrics.csv"
501668
)
502669
}
503670
}

app/models/Extraction.scala

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,20 @@ case class Extraction(
2323
user_id: UUID = User.anonymous.id
2424
)
2525

26+
// Used in extraction report aggregation
27+
case class ExtractionJob(
28+
target: String,
29+
targetType: String,
30+
extractor: String,
31+
spaces: String,
32+
jobId: String,
33+
jobType: String,
34+
statusCount: Int,
35+
lastStatus: String,
36+
start: Date,
37+
end: Date
38+
)
39+
2640
/**
2741
* Currently running extractor name
2842
*/

app/services/DatasetService.scala

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -383,6 +383,7 @@ trait DatasetService {
383383

384384
def incrementDownloads(id: UUID, user: Option[User])
385385

386-
def getIterator(space: Option[UUID], since: Option[String], until: Option[String]): Iterator[Dataset]
386+
def getIterator(space: Option[String], since: Option[String], until: Option[String]): Iterator[Dataset]
387387

388+
def getTrashedIds(): List[UUID]
388389
}

app/services/ExtractionService.scala

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@ trait ExtractionService {
1818

1919
def get(msgId: UUID): Option[Extraction]
2020

21+
def getIterator(userRequired: Boolean, since: Option[String], until: Option[String], user: Option[UUID]): Iterator[Extraction]
22+
2123
def findById(resource: ResourceRef): List[Extraction]
2224

2325
def findByExtractorIDBefore(extractorID: String, status: String, date: String, limit: Int): List[Extraction]

app/services/FileService.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -246,6 +246,6 @@ trait FileService {
246246

247247
def incrementDownloads(id: UUID, user: Option[User])
248248

249-
def getIterator(space: Option[UUID], since: Option[String], until: Option[String]): Iterator[File]
249+
def getIterator(space: Option[String], since: Option[String], until: Option[String]): Iterator[File]
250250

251251
}

app/services/mongodb/MongoDBCollectionService.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -894,7 +894,7 @@ class MongoDBCollectionService @Inject() (
894894

895895
def indexAll(idx: Option[String] = None) = {
896896
// Bypass Salat in case any of the file records are malformed to continue past them
897-
Collection.dao.collection.find(MongoDBObject(), MongoDBObject("_id" -> 1)).foreach(c => {
897+
Collection.dao.collection.find(MongoDBObject("trash" -> false), MongoDBObject("_id" -> 1)).foreach(c => {
898898
index(new UUID(c.get("_id").toString), idx)
899899
})
900900
}

app/services/mongodb/MongoDBDatasetService.scala

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1412,7 +1412,7 @@ class MongoDBDatasetService @Inject() (
14121412

14131413
def indexAll(idx: Option[String] = None) = {
14141414
// Bypass Salat in case any of the file records are malformed to continue past them
1415-
Dataset.dao.collection.find(MongoDBObject(), MongoDBObject("_id" -> 1)).foreach(d => {
1415+
Dataset.dao.collection.find(MongoDBObject("trash" -> false), MongoDBObject("_id" -> 1)).foreach(d => {
14161416
index(new UUID(d.get("_id").toString), idx)
14171417
})
14181418
}
@@ -1641,13 +1641,23 @@ class MongoDBDatasetService @Inject() (
16411641
* @param since - include only datasets created after a certain date
16421642
* @param until - include only datasets created before a certain date
16431643
*/
1644-
def getIterator(space: Option[UUID], since: Option[String], until: Option[String]): Iterator[Dataset] = {
1644+
def getIterator(space: Option[String], since: Option[String], until: Option[String]): Iterator[Dataset] = {
16451645
var query = MongoDBObject("trash" -> false)
1646-
space.foreach(spid => query += ("spaces" -> new ObjectId(spid.stringify)))
1646+
space.foreach(spid => query += ("spaces" -> new ObjectId(spid)))
16471647
since.foreach(t => query = query ++ ("created" $gte Parsers.fromISO8601(t)))
16481648
until.foreach(t => query = query ++ ("created" $lte Parsers.fromISO8601(t)))
16491649
Dataset.find(query)
16501650
}
1651+
1652+
// Get a list of all trashed dataset and file ids for comparison
1653+
def getTrashedIds(): List[UUID] = {
1654+
val trashedIds = ListBuffer[UUID]()
1655+
Dataset.find(MongoDBObject("trash" -> true)).map(ds => {
1656+
ds.files.foreach(fid => trashedIds += fid)
1657+
trashedIds += ds.id
1658+
})
1659+
trashedIds.toList
1660+
}
16511661
}
16521662

16531663
object Dataset extends ModelCompanion[Dataset, ObjectId] {

app/services/mongodb/MongoDBExtractionService.scala

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,17 +3,19 @@ package services.mongodb
33
import java.text.SimpleDateFormat
44

55
import services.ExtractionService
6-
import models.{UUID, Extraction, ExtractionGroup, ResourceRef}
6+
import models.{Extraction, ExtractionGroup, ResourceRef, UUID}
77
import org.bson.types.ObjectId
88
import play.api.Play.current
99
import com.novus.salat.dao.ModelCompanion
1010
import com.novus.salat.dao.SalatDAO
1111
import MongoContext.context
1212
import com.mongodb.casbah.commons.MongoDBObject
1313
import java.util.Date
14+
1415
import play.api.Logger
1516
import models.WebPageResource
1617
import com.mongodb.casbah.Imports._
18+
import util.Parsers
1719

1820
/**
1921
* Use MongoDB to store extractions
@@ -37,6 +39,15 @@ class MongoDBExtractionService extends ExtractionService {
3739
Extraction.findOne(MongoDBObject("id" -> new ObjectId(msgId.stringify)))
3840
}
3941

42+
def getIterator(userRequired: Boolean, since: Option[String], until: Option[String], user: Option[UUID]): Iterator[Extraction] = {
43+
var query = MongoDBObject()
44+
if (userRequired) query = query ++ ("user_id" $exists true)
45+
since.foreach(t => query = query ++ ("start" $gte Parsers.fromISO8601(t)))
46+
until.foreach(t => query = query ++ ("start" $lte Parsers.fromISO8601(t)))
47+
user.foreach(uid => query = query ++ ("user_id" -> new ObjectId(uid.stringify)))
48+
Extraction.find(query).toIterator
49+
}
50+
4051
def findById(resource: ResourceRef): List[Extraction] = {
4152
Extraction.find(MongoDBObject("file_id" -> new ObjectId(resource.id.stringify))).toList
4253
}

app/services/mongodb/MongoDBFileService.scala

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -326,8 +326,11 @@ class MongoDBFileService @Inject() (
326326

327327
def indexAll(idx: Option[String] = None) = {
328328
// Bypass Salat in case any of the file records are malformed to continue past them
329+
val trashedIds = datasets.getTrashedIds()
329330
FileDAO.dao.collection.find(MongoDBObject(), MongoDBObject("_id" -> 1)).foreach(f => {
330-
index(new UUID(f.get("_id").toString), idx)
331+
val fid = new UUID(f.get("_id").toString)
332+
if (!trashedIds.contains(fid))
333+
index(fid, idx)
331334
})
332335
}
333336

@@ -1215,7 +1218,7 @@ class MongoDBFileService @Inject() (
12151218
}
12161219
}
12171220

1218-
def getIterator(space: Option[UUID], since: Option[String], until: Option[String]): Iterator[File] = {
1221+
def getIterator(space: Option[String], since: Option[String], until: Option[String]): Iterator[File] = {
12191222
var query = MongoDBObject()
12201223
space.foreach(spid => {
12211224
// If space is specified, we have to get that association from datasets for now

0 commit comments

Comments
 (0)