Skip to content

Commit ed02dec

Browse files
committed
Count only for the first hit for subject or tissue within filename
Also added an assertion so we do not count incorrectly. But maybe it should be just a warning? Closes #172
1 parent b310e3e commit ed02dec

File tree

1 file changed

+11
-5
lines changed

1 file changed

+11
-5
lines changed

dandischema/metadata.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -311,13 +311,16 @@ def _add_asset_to_stats(assetmeta: Dict[str, Any], stats: _stats_type) -> None:
311311
stats = _get_samples(value, stats, hierarchy)
312312
break
313313

314+
# which components were already found, so we do not count them more than
315+
# once in some incorrectly named datasets
316+
found: Dict[str, str] = {}
314317
for part in Path(assetmeta["path"]).name.split(".")[0].split("_"):
315-
if part.startswith("sub-"):
316-
subject = part.replace("sub-", "")
318+
if not found.get("subject") and part.startswith("sub-"):
319+
found["subject"] = subject = part.split("sub-", 1)[1]
317320
if subject not in stats["subjects"]:
318321
stats["subjects"].append(subject)
319-
if part.startswith("sample-"):
320-
sample = part.replace("sample-", "")
322+
if not found.get("sample") and part.startswith("sample-"):
323+
found["sample"] = sample = part.replace("sample-", "")
321324
if sample not in stats["tissuesample"]:
322325
stats["tissuesample"].append(sample)
323326

@@ -338,10 +341,13 @@ def aggregate_assets_summary(metadata: Iterable[Dict[str, Any]]) -> dict:
338341
stats: _stats_type = {}
339342
for meta in metadata:
340343
_add_asset_to_stats(meta, stats)
341-
342344
stats["numberOfBytes"] = stats.get("numberOfBytes", 0)
343345
stats["numberOfFiles"] = stats.get("numberOfFiles", 0)
344346
stats["numberOfSubjects"] = len(stats.pop("subjects", [])) or None
347+
if stats["numberOfSubjects"]:
348+
# Must not happen. If it does -- it is a bug in the software
349+
assert stats["numberOfFiles"]
350+
assert stats["numberOfSubjects"] <= stats["numberOfFiles"]
345351
stats["numberOfSamples"] = (
346352
len(stats.pop("tissuesample", [])) + len(stats.pop("slice", []))
347353
) or None

0 commit comments

Comments
 (0)