Skip to content

Commit 6f9bd7b

Browse files
Fix deduplication of mags/attachments to scan for storage (#8981)
This PR is a follow up fix for #8789. The problem with #8789 is that it allowed mags to be measured for storage amount multiple times if used across multiple times across multipe organizations. This PR fixes the logical bug in the sql query. Same goes for attachments. ### URL of deployed dev instance (used for testing): - https://___.webknossos.xyz ### Steps to test: - I suggest to test locally. - On master: Create a new orga (e.g. via `isWkOrgInstance=true` in `application.conf`); - also reduce `fetchUsedStorage` time in `application.conf`. e.g. `rescanInterval = 30 seconds` `tickerInterval = 10 seconds` - Symlink a dataset from the sample_organization into the new orga - Trigger a dataset rescan in the new orga - The symlinked dataset's mags should now appear in ``` SELECT * FROM (SELECT COUNT(*) AS count, COALESCE(realPath, path) as ppath FROM webknossos.dataset_mags GROUP BY ppath) WHERE count > 1 ``` - Check the `webknossos.organization_usedstorage_mags` table. It should list duplicate measurements for the mag paths of the symlinked dataset. ``` SELECT * FROM (SELECT COUNT(*) AS count, path FROM webknossos.organization_usedstorage_mags GROUP BY path) WHERE count > 1 ``` - Now checkout this branch. !Do not refresh the schema! - Run this branch's evolution - Wait a bit for the storage rescan and check the `organization_usedstorage_mags` table again. There should no longer be duplicates. The following query's set should be empty now ``` SELECT * FROM (SELECT COUNT(*) AS count, path FROM webknossos.organization_usedstorage_mags GROUP BY path) WHERE count > 1 ``` ### Issues: - fixes https://scm.slack.com/archives/C5AKLAV0B/p1759761569291259; follow up for #8789 ------ (Please delete unneeded items, merge only when none are left open) - [x] Added changelog entry (create a `$PR_NUMBER.md` file in `unreleased_changes` or use `./tools/create-changelog-entry.py`) - [x] Added migration guide entry if applicable (edit the same file as for the changelog) - [x] Needs datastore update after deployment --------- Co-authored-by: Florian M <[email protected]>
1 parent 1eea8e7 commit 6f9bd7b

File tree

6 files changed

+79
-31
lines changed

6 files changed

+79
-31
lines changed

app/models/dataset/Dataset.scala

Lines changed: 48 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -778,19 +778,31 @@ class DatasetMagsDAO @Inject()(sqlClient: SqlClient)(implicit ec: ExecutionConte
778778
dataStoreId: String,
779779
datasetIdOpt: Option[ObjectId]): Fox[List[DataSourceMagRow]] =
780780
for {
781-
storageRelevantMags <- run(q"""SELECT
782-
dataset_id, dataLayerName, mag, path, realPath, hasLocalData, _organization, directoryName
783-
FROM (
784-
-- rn is the rank of the mags with the same path. We only retrieve the oldest mag with the same path to measure each mag only once.
785-
SELECT ds._id AS dataset_id, mag.dataLayerName, mag.mag, mag.path, mag.realPath, mag.hasLocalData,
786-
ds._organization, ds.directoryName, ROW_NUMBER() OVER (PARTITION BY COALESCE(mag.realPath, mag.path) ORDER BY ds.created ASC) AS rn
781+
storageRelevantMags <- run(q"""
782+
WITH ranked AS (
783+
SELECT
784+
ds._id AS dataset_id, mag.dataLayerName, mag.mag, mag.path, mag.realPath, mag.hasLocalData,
785+
ds._organization, ds._dataStore, ds.directoryName,
786+
-- rn is the rank of the mags with the same path. It is used to deduplicate mags with the same path to
787+
-- count each physical mag only once. Filtering is done below.
788+
ROW_NUMBER() OVER (
789+
PARTITION BY COALESCE(mag.realPath, mag.path)
790+
ORDER BY ds.created ASC
791+
) AS rn
787792
FROM webknossos.dataset_mags AS mag
788-
JOIN webknossos.datasets AS ds ON mag._dataset = ds._id
789-
WHERE ds._organization = $organizationId
790-
AND ds._dataStore = $dataStoreId
791-
${datasetIdOpt.map(datasetId => q"AND ds._id = $datasetId").getOrElse(q"")}
792-
) AS ranked
793-
WHERE rn = 1;""".as[DataSourceMagRow])
793+
JOIN webknossos.datasets AS ds
794+
ON mag._dataset = ds._id
795+
)
796+
SELECT
797+
dataset_id, dataLayerName, mag, path, realPath, hasLocalData, _organization, directoryName
798+
FROM ranked
799+
-- Filter !after! grouping mags with the same path,
800+
-- so mags shared between organizations are deduplicated properly using rn.
801+
WHERE rn = 1
802+
AND ranked._organization = $organizationId
803+
AND ranked._dataStore = $dataStoreId
804+
${datasetIdOpt.map(datasetId => q"AND ranked.dataset_id = $datasetId").getOrElse(q"")};
805+
""".as[DataSourceMagRow])
794806
} yield storageRelevantMags.toList
795807

796808
def updateMags(datasetId: ObjectId, dataLayers: List[StaticLayer]): Fox[Unit] = {
@@ -1221,23 +1233,30 @@ class DatasetLayerAttachmentsDAO @Inject()(sqlClient: SqlClient)(implicit ec: Ex
12211233
dataStoreId: String,
12221234
datasetIdOpt: Option[ObjectId]): Fox[List[StorageRelevantDataLayerAttachment]] =
12231235
for {
1224-
storageRelevantAttachments <- run(q"""SELECT
1225-
_dataset, layerName, name, path, type, _organization, directoryName
1226-
FROM (
1227-
-- rn is the rank of the attachments with the same path. We only retrieve the
1228-
-- oldest attachment with the same path to measuring an attachment only once.
1229-
SELECT
1230-
att._dataset, att.layerName, att.name, att.path, att.type, ds._organization, ds.directoryName,
1231-
ROW_NUMBER() OVER (PARTITION BY att.path ORDER BY ds.created ASC) AS rn
1232-
FROM webknossos.dataset_layer_attachments AS att
1233-
JOIN webknossos.datasets AS ds ON att._dataset = ds._id
1234-
WHERE ds._organization = $organizationId
1235-
AND ds._dataStore = $dataStoreId
1236-
${datasetIdOpt
1237-
.map(datasetId => q"AND ds._id = $datasetId")
1238-
.getOrElse(q"")}
1239-
) AS ranked
1240-
WHERE rn = 1;""".as[StorageRelevantDataLayerAttachment])
1236+
storageRelevantAttachments <- run(q"""
1237+
WITH ranked AS (
1238+
SELECT
1239+
att._dataset, att.layerName, att.name, att.path, att.type, ds._organization, ds._dataStore, ds.directoryName,
1240+
-- rn is the rank of the attachments with the same path. It is used to deduplicate attachments with the same path
1241+
-- to count each physical attachment only once. Filtering is done below.
1242+
ROW_NUMBER() OVER (
1243+
PARTITION BY att.path
1244+
ORDER BY ds.created ASC
1245+
) AS rn
1246+
FROM webknossos.dataset_layer_attachments AS att
1247+
JOIN webknossos.datasets AS ds
1248+
ON att._dataset = ds._id
1249+
)
1250+
SELECT
1251+
_dataset, layerName, name, path, type, _organization, directoryName
1252+
FROM ranked
1253+
-- Filter !after! grouping attachments with the same path,
1254+
-- so attachments shared between organizations are deduplicated properly using rn.
1255+
WHERE ranked.rn = 1
1256+
AND ranked._organization = $organizationId
1257+
AND ranked._dataStore = $dataStoreId
1258+
${datasetIdOpt.map(datasetId => q"AND ranked._dataset = $datasetId").getOrElse(q"")};
1259+
""".as[StorageRelevantDataLayerAttachment])
12411260
} yield storageRelevantAttachments.toList
12421261
}
12431262

conf/application.conf

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -245,6 +245,8 @@ datastore {
245245
# For GoogleServiceAccount the secret is a json string.
246246
# The credentials are selected by uri prefix, so different s3 uri styles may need duplicated credential entries.
247247
credentials = []
248+
# Use for local dev scenarios (needs env variable to be defined).
249+
# credentials = [{type = "S3AccessKey", name = ${?S3_BUCKET}, identifier = ${?S3_ACCESS_KEY_ID}, secret = ${?S3_SECRET_ACCESS_KEY}}]
248250
}
249251
}
250252

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
START TRANSACTION;
2+
3+
do $$ begin if (select schemaVersion from webknossos.releaseInformation) <> 143 then raise exception 'Previous schema version mismatch'; end if; end; $$ language plpgsql;
4+
5+
CREATE INDEX ON webknossos.dataset_mags(COALESCE(realPath, path));
6+
CREATE INDEX ON webknossos.dataset_layer_attachments(path);
7+
8+
UPDATE webknossos.releaseInformation SET schemaVersion = 144;
9+
10+
COMMIT TRANSACTION;
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
START TRANSACTION;
2+
3+
do $$ begin if (select schemaVersion from webknossos.releaseInformation) <> 144 then raise exception 'Previous schema version mismatch'; end if; end; $$ language plpgsql;
4+
5+
DROP INDEX IF EXISTS webknossos.dataset_mags_coalesce_idx;
6+
DROP INDEX IF EXISTS webknossos.dataset_layer_attachments_path_idx;
7+
8+
UPDATE webknossos.releaseInformation SET schemaVersion = 143;
9+
10+
11+
COMMIT TRANSACTION;

tools/postgres/schema.sql

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ CREATE TABLE webknossos.releaseInformation (
2121
schemaVersion BIGINT NOT NULL
2222
);
2323

24-
INSERT INTO webknossos.releaseInformation(schemaVersion) values(143);
24+
INSERT INTO webknossos.releaseInformation(schemaVersion) values(144);
2525
COMMIT TRANSACTION;
2626

2727

@@ -849,10 +849,11 @@ CREATE INDEX ON webknossos.invites(tokenValue);
849849
CREATE INDEX ON webknossos.annotation_privateLinks(accessToken);
850850
CREATE INDEX ON webknossos.shortLinks(key);
851851
CREATE INDEX ON webknossos.credit_transactions(credit_state);
852+
CREATE INDEX ON webknossos.dataset_mags(COALESCE(realPath, path));
853+
CREATE INDEX ON webknossos.dataset_layer_attachments(path);
852854
CREATE INDEX ON webknossos.organization_usedStorage_mags(_organization);
853855
CREATE INDEX ON webknossos.organization_usedStorage_attachments(_organization);
854856

855-
856857
ALTER TABLE webknossos.annotations
857858
ADD CONSTRAINT task_ref FOREIGN KEY(_task) REFERENCES webknossos.tasks(_id) ON DELETE SET NULL DEFERRABLE,
858859
ADD CONSTRAINT team_ref FOREIGN KEY(_team) REFERENCES webknossos.teams(_id) DEFERRABLE,

unreleased_changes/8981.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
### Fixed
2+
- Fixed a bug where storage scans for dataset mags & attachments used accross multiple organizations would count some paths twice.
3+
4+
### Postgres Evolutions
5+
[144-improve-storage-scan.sql](conf/evolutions/144-improve-storage-scan.sql)

0 commit comments

Comments
 (0)