Commit 5aa9890

Implement deletion of S3 datasets
1 parent 5c64063 commit 5aa9890

4 files changed: +148 -15 lines changed

app/models/dataset/DatasetService.scala

Lines changed: 10 additions & 10 deletions
@@ -671,16 +671,16 @@ class DatasetService @Inject()(organizationDAO: OrganizationDAO,

   def deleteVirtualOrDiskDataset(dataset: Dataset)(implicit ctx: DBAccessContext): Fox[Unit] =
     for {
-      _ <- if (dataset.isVirtual) {
-        // At this point, we should also free space in S3 once implemented.
-        // Right now, we can just mark the dataset as deleted in the database.
-        datasetDAO.deleteDataset(dataset._id, onlyMarkAsDeleted = true)
-      } else {
-        for {
-          datastoreClient <- clientFor(dataset)
-          _ <- datastoreClient.deleteOnDisk(dataset._id)
-        } yield ()
-      } ?~> "dataset.delete.failed"
+      //_ <- if (dataset.isVirtual) {
+      // At this point, we should also free space in S3 once implemented.
+      // Right now, we can just mark the dataset as deleted in the database.
+      // datasetDAO.deleteDataset(dataset._id, onlyMarkAsDeleted = true)
+      //} else {
+      //for {
+      datastoreClient <- clientFor(dataset)
+      _ <- datastoreClient.deleteOnDisk(dataset._id) ?~> "dataset.delete.failed"
+      // } yield ()
+      //} ?~> "dataset.delete.failed"
     } yield ()

   def publicWrites(dataset: Dataset,
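
With the virtual/disk branching commented out, deleteVirtualOrDiskDataset now always delegates deletion to the datastore, which decides whether the data lives on disk or in controlled S3. Stripped of the commented-out lines, the effective method reads as follows (a sketch of the resulting code, not itself part of the diff):

  def deleteVirtualOrDiskDataset(dataset: Dataset)(implicit ctx: DBAccessContext): Fox[Unit] =
    for {
      // Resolve the datastore client responsible for this dataset ...
      datastoreClient <- clientFor(dataset)
      // ... and let it perform the actual deletion (on disk or in controlled S3).
      _ <- datastoreClient.deleteOnDisk(dataset._id) ?~> "dataset.delete.failed"
    } yield ()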

webknossos-datastore/app/com/scalableminds/webknossos/datastore/controllers/DataSourceController.scala

Lines changed: 4 additions & 2 deletions
@@ -448,20 +448,22 @@ class DataSourceController @Inject()(
     for {
       dataSource <- datasetCache.getById(datasetId) ~> NOT_FOUND
       dataSourceId = dataSource.id
-      _ <- if (dataSourceService.existsOnDisk(dataSourceId.organizationId, dataSourceId.directoryName)) {
+      existsOnDisk = dataSourceService.existsOnDisk(dataSourceId.organizationId, dataSourceId.directoryName)
+      _ <- if (existsOnDisk) {
         for {
           _ <- dataSourceService.deleteOnDisk(
             dataSourceId.organizationId,
             dataSourceId.directoryName,
             Some(datasetId),
             reason = Some("the user wants to delete the dataset")) ?~> "dataset.delete.failed"
-          _ <- dsRemoteWebknossosClient.deleteDataSource(dataSourceId)
         } yield ()
       } else
         for {
+          _ <- Fox.runIf(dataSourceService.datasetInControlledS3(dataSource))(dataSourceService.deleteFromControlledS3(dataSource))
           _ <- dsRemoteWebknossosClient.deleteDataSource(dataSourceId)
           _ = logger.warn(s"Tried to delete dataset ${dataSource.id} that is not on disk.")
         } yield ()
+      _ <- dsRemoteWebknossosClient.deleteDataSource(dataSourceId)
     } yield Ok
   }
 }
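
Two things change in this controller action: the non-disk case now frees S3 objects via the new datasetInControlledS3/deleteFromControlledS3 pair, and the dsRemoteWebknossosClient.deleteDataSource call that reports the deletion to webknossos core moves out of the on-disk branch so that it runs after either branch. Reduced to its control flow, a simplified sketch (argument lists elided as marked):

  for {
    dataSource <- datasetCache.getById(datasetId) ~> NOT_FOUND
    dataSourceId = dataSource.id
    existsOnDisk = dataSourceService.existsOnDisk(dataSourceId.organizationId, dataSourceId.directoryName)
    _ <- if (existsOnDisk)
      dataSourceService.deleteOnDisk(/* arguments as in the diff */) // physical dataset: remove from disk
    else
      // not on disk: free the S3 objects if the dataset lives in controlled S3
      Fox.runIf(dataSourceService.datasetInControlledS3(dataSource))(dataSourceService.deleteFromControlledS3(dataSource))
    _ <- dsRemoteWebknossosClient.deleteDataSource(dataSourceId) // report the deletion in both cases
  } yield Ok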

webknossos-datastore/app/com/scalableminds/webknossos/datastore/datavault/S3DataVault.scala

Lines changed: 3 additions & 2 deletions
@@ -159,7 +159,8 @@ object S3DataVault {
       new S3DataVault(credential, remoteSourceDescriptor.uri, ws, ec)
   }

-  private def hostBucketFromUri(uri: URI): Option[String] = {
+  // TODO: Move non private methods to trait?
+  def hostBucketFromUri(uri: URI): Option[String] = {
     val host = uri.getHost
     if (isShortStyle(uri)) { // assume host is omitted from uri, shortcut form s3://bucket/key
       Some(host)
@@ -185,7 +186,7 @@ object S3DataVault {
   private def isShortStyle(uri: URI): Boolean =
     !uri.getHost.contains(".")

-  private def objectKeyFromUri(uri: URI): Box[String] =
+  def objectKeyFromUri(uri: URI): Box[String] =
     if (isVirtualHostedStyle(uri)) {
       Full(uri.getPath)
     } else if (isPathStyle(uri)) {
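
Both helpers are made public so that DataSourceService can reuse them when deleting from controlled S3. For orientation, the three S3 URI styles that S3DataVault distinguishes, with hypothetical example URIs (the returned values assume standard S3 addressing conventions):

  // Short style: the host part is the bucket itself.
  S3DataVault.hostBucketFromUri(new URI("s3://my-bucket/org/dataset/layer"))              // Some("my-bucket")
  // Virtual-hosted style: the bucket is the first host label, the key is the full path.
  S3DataVault.objectKeyFromUri(new URI("https://my-bucket.s3.amazonaws.com/org/dataset")) // Full("/org/dataset")
  // Path style: the bucket is the first path segment, the key is the remainder.
  S3DataVault.objectKeyFromUri(new URI("https://s3.amazonaws.com/my-bucket/org/dataset")) // key without the bucket segment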

webknossos-datastore/app/com/scalableminds/webknossos/datastore/services/DataSourceService.scala

Lines changed: 131 additions & 1 deletion
@@ -13,19 +13,38 @@ import com.scalableminds.webknossos.datastore.dataformats.{MagLocator, MappingPr
 import com.scalableminds.webknossos.datastore.helpers.{DatasetDeleter, IntervalScheduler}
 import com.scalableminds.webknossos.datastore.models.datasource._
 import com.scalableminds.webknossos.datastore.models.datasource.inbox.{InboxDataSource, UnusableDataSource}
-import com.scalableminds.webknossos.datastore.storage.{DataVaultService, RemoteSourceDescriptorService}
+import com.scalableminds.webknossos.datastore.storage.{
+  CredentialConfigReader,
+  DataVaultService,
+  RemoteSourceDescriptorService,
+  S3AccessKeyCredential
+}
 import com.typesafe.scalalogging.LazyLogging
 import com.scalableminds.util.tools.Box.tryo
 import com.scalableminds.util.tools._
+import com.scalableminds.webknossos.datastore.datavault.S3DataVault
 import play.api.inject.ApplicationLifecycle
 import play.api.libs.json.Json
+import software.amazon.awssdk.auth.credentials.{AwsBasicCredentials, StaticCredentialsProvider}
+import software.amazon.awssdk.core.checksums.RequestChecksumCalculation
+import software.amazon.awssdk.regions.Region
+import software.amazon.awssdk.services.s3.S3AsyncClient
+import software.amazon.awssdk.services.s3.model.{
+  Delete,
+  DeleteObjectsRequest,
+  DeleteObjectsResponse,
+  ListObjectsV2Request,
+  ObjectIdentifier
+}

 import java.io.{File, FileWriter}
 import java.net.URI
 import java.nio.file.{Files, Path}
 import scala.concurrent.ExecutionContext
 import scala.concurrent.duration._
 import scala.io.Source
+import scala.jdk.CollectionConverters._
+import scala.jdk.FutureConverters._

 class DataSourceService @Inject()(
     config: DataStoreConfig,
@@ -446,4 +465,115 @@ class DataSourceService @Inject()(
             remoteSourceDescriptorService.removeVaultFromCache(attachment)))
       } yield dataLayer.mags.length
     } yield removedEntriesList.sum
+
+  private lazy val globalCredentials = {
+    val res = config.Datastore.DataVaults.credentials.flatMap { credentialConfig =>
+      new CredentialConfigReader(credentialConfig).getCredential
+    }
+    logger.info(s"Parsed ${res.length} global data vault credentials from datastore config.")
+    res
+  }
+
+  def datasetInControlledS3(dataSource: DataSource) = {
+    def commonPrefix(strings: Seq[String]): String = {
+      if (strings.isEmpty) return ""
+
+      strings.reduce { (a, b) =>
+        a.zip(b).takeWhile { case (c1, c2) => c1 == c2 }.map(_._1).mkString
+      }
+    }
+
+    val allPaths = dataSource.allExplicitPaths
+    val sharedPath = commonPrefix(allPaths)
+    val matchingCredentials = globalCredentials.filter(c => sharedPath.startsWith(c.name))
+    matchingCredentials.nonEmpty && sharedPath.startsWith("s3")
+  }
+
+  private lazy val s3UploadCredentialsOpt: Option[(String, String)] =
+    config.Datastore.DataVaults.credentials.flatMap { credentialConfig =>
+      new CredentialConfigReader(credentialConfig).getCredential
+    }.collectFirst {
+      case S3AccessKeyCredential(credentialName, accessKeyId, secretAccessKey, _, _)
+          if config.Datastore.S3Upload.credentialName == credentialName =>
+        (accessKeyId, secretAccessKey)
+    }
+  private lazy val s3Client: S3AsyncClient = S3AsyncClient
+    .builder()
+    .credentialsProvider(
+      StaticCredentialsProvider.create(
+        AwsBasicCredentials.builder
+          .accessKeyId(s3UploadCredentialsOpt.getOrElse(("", ""))._1)
+          .secretAccessKey(s3UploadCredentialsOpt.getOrElse(("", ""))._2)
+          .build()
+      ))
+    .crossRegionAccessEnabled(true)
+    .forcePathStyle(true)
+    .endpointOverride(new URI(config.Datastore.S3Upload.endpoint))
+    .region(Region.US_EAST_1)
+    // Disabling checksum calculation prevents files being stored with Content Encoding "aws-chunked".
+    .requestChecksumCalculation(RequestChecksumCalculation.WHEN_REQUIRED)
+    .build()

+  def deleteFromControlledS3(dataSource: DataSource): Fox[Unit] = {
+    // TODO: Do we handle other datasets using the same layers?
+
+    def deleteBatch(bucket: String, keys: Seq[String]): Fox[DeleteObjectsResponse] =
+      if (keys.isEmpty) Fox.empty
+      else {
+        Fox.fromFuture(
+          s3Client
+            .deleteObjects(
+              DeleteObjectsRequest
+                .builder()
+                .bucket(bucket)
+                .delete(
+                  Delete
+                    .builder()
+                    .objects(
+                      keys.map(k => ObjectIdentifier.builder().key(k).build()).asJava
+                    )
+                    .build()
+                )
+                .build()
+            )
+            .asScala)
+      }
+
+    def listKeysAtPrefix(bucket: String, prefix: String): Fox[Seq[String]] = {
+      def listRec(continuationToken: Option[String], acc: Seq[String]): Fox[Seq[String]] = {
+        val builder = ListObjectsV2Request.builder().bucket(bucket).prefix(prefix).maxKeys(1000)
+        val request = continuationToken match {
+          case Some(token) => builder.continuationToken(token).build()
+          case None        => builder.build()
+        }
+        for {
+          response <- Fox.fromFuture(s3Client.listObjectsV2(request).asScala)
+          keys = response.contents().asScala.map(_.key())
+          allKeys = acc ++ keys
+          result <- if (response.isTruncated) {
+            listRec(Option(response.nextContinuationToken()), allKeys)
+          } else {
+            Fox.successful(allKeys)
+          }
+        } yield result
+      }
+      listRec(None, Seq())
+    }
+
+    for {
+      _ <- Fox.successful(())
+      paths = dataSource.allExplicitPaths
+      // Assume everything is in the same bucket
+      firstPath <- paths.headOption.toFox ?~> "No explicit paths found for dataset in controlled S3"
+      bucket <- S3DataVault
+        .hostBucketFromUri(new URI(firstPath))
+        .toFox ?~> s"Could not determine S3 bucket from path $firstPath"
+      prefixes <- Fox.combined(paths.map(path => S3DataVault.objectKeyFromUri(new URI(path)).toFox))
+      keys: Seq[String] <- Fox.serialCombined(prefixes)(listKeysAtPrefix(bucket, _)).map(_.flatten)
+      uniqueKeys = keys.distinct
+      _ = logger.info(
+        s"Deleting ${uniqueKeys.length} objects from controlled S3 bucket $bucket for dataset ${dataSource.id}")
+      _ <- Fox.serialCombined(uniqueKeys.grouped(1000).toSeq)(deleteBatch(bucket, _)).map(_ => ())
+    } yield ()
+  }
 }
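
datasetInControlledS3 decides ownership heuristically: a dataset counts as living in controlled S3 if the longest common prefix of all its explicit paths starts with the name of a configured global credential and with "s3". Note that the nested commonPrefix helper compares character-wise, so the shared prefix may end mid-segment. A worked example with hypothetical paths:

  commonPrefix(Seq(
    "s3://bucket/org/dataset/color/1",
    "s3://bucket/org/dataset/color/2-2-1",
    "s3://bucket/org/dataset/segmentation/1"
  )) // == "s3://bucket/org/dataset/", since "color..." and "segmentation..." diverge at the first character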

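deleteFromControlledS3 follows the standard list-then-delete pattern, and both occurrences of 1000 come from S3 itself: ListObjectsV2 returns at most 1000 keys per page (hence the continuation-token recursion in listRec), and DeleteObjects accepts at most 1000 keys per request (hence grouped(1000) before deleteBatch). A minimal self-contained sketch of the same pattern on plain Scala Futures, assuming an S3AsyncClient configured like the one above:

  import scala.concurrent.{ExecutionContext, Future}
  import scala.jdk.CollectionConverters._
  import scala.jdk.FutureConverters._
  import software.amazon.awssdk.services.s3.S3AsyncClient
  import software.amazon.awssdk.services.s3.model.{Delete, DeleteObjectsRequest, ListObjectsV2Request, ObjectIdentifier}

  // List every key below `prefix`, following continuation tokens page by page (1000 keys per page).
  def listAllKeys(s3: S3AsyncClient, bucket: String, prefix: String, token: Option[String] = None)(
      implicit ec: ExecutionContext): Future[Seq[String]] = {
    val builder = ListObjectsV2Request.builder().bucket(bucket).prefix(prefix).maxKeys(1000)
    val request = token.fold(builder)(builder.continuationToken).build()
    s3.listObjectsV2(request).asScala.flatMap { response =>
      val keys = response.contents().asScala.map(_.key()).toSeq
      if (response.isTruncated) listAllKeys(s3, bucket, prefix, Option(response.nextContinuationToken())).map(keys ++ _)
      else Future.successful(keys)
    }
  }

  // Delete keys sequentially in batches of at most 1000, the DeleteObjects request limit.
  def deleteAllKeys(s3: S3AsyncClient, bucket: String, keys: Seq[String])(
      implicit ec: ExecutionContext): Future[Unit] =
    keys.grouped(1000).foldLeft(Future.unit) { (previous, batch) =>
      previous.flatMap { _ =>
        val delete = Delete.builder().objects(batch.map(k => ObjectIdentifier.builder().key(k).build()).asJava).build()
        s3.deleteObjects(DeleteObjectsRequest.builder().bucket(bucket).delete(delete).build()).asScala.map(_ => ())
      }
    }

Collecting all keys before deleting, as the commit does, also makes it possible to log the total object count up front.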