Skip to content

Commit 95f12d9

Browse files
committed
Merge remote-tracking branch 'origin/enh/googledataset' into enh/googledataset
* origin/enh/googledataset: [pre-commit.ci] auto fixes from pre-commit.com hooks
2 parents e26dad9 + 27a1f07 commit 95f12d9

File tree

2 files changed

+67
-62
lines changed

2 files changed

+67
-62
lines changed

dandischema/tests/test_google_dataset_metadata.py

Lines changed: 30 additions & 30 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
import copy
2-
from typing import Dict, Any
2+
from typing import Any, Dict
33

44
import pytest
55

@@ -22,15 +22,15 @@ def sample_dandiset_metadata() -> Dict[str, Any]:
2222
"roleName": ["dcite:Author", "dcite:ContactPerson"],
2323
"identifier": "0000-0001-2345-6789",
2424
"email": "john.doe@example.com",
25-
"includeInCitation": True
25+
"includeInCitation": True,
2626
},
2727
{
2828
"schemaKey": "Organization",
2929
"name": "Test Organization",
3030
"roleName": ["dcite:Sponsor"],
3131
"identifier": "https://ror.org/xxxxxxxxx",
32-
"includeInCitation": False
33-
}
32+
"includeInCitation": False,
33+
},
3434
],
3535
"license": ["spdx:CC-BY-4.0"],
3636
"schemaVersion": "0.6.4",
@@ -41,38 +41,38 @@ def sample_dandiset_metadata() -> Dict[str, Any]:
4141
"dataStandard": [
4242
{
4343
"name": "Neurodata Without Borders (NWB)",
44-
"identifier": "RRID:SCR_015242"
44+
"identifier": "RRID:SCR_015242",
4545
}
4646
],
4747
"species": [
4848
{
4949
"name": "Homo sapiens",
50-
"identifier": "http://purl.obolibrary.org/obo/NCBITaxon_9606"
50+
"identifier": "http://purl.obolibrary.org/obo/NCBITaxon_9606",
5151
}
5252
],
5353
"approach": [
5454
{
5555
"name": "electrophysiology",
56-
"identifier": "http://uri.interlex.org/base/ilx_0739363"
56+
"identifier": "http://uri.interlex.org/base/ilx_0739363",
5757
}
5858
],
5959
"measurementTechnique": [
6060
{
6161
"name": "multi-electrode extracellular electrophysiology",
62-
"identifier": "http://uri.interlex.org/base/ilx_0739400"
62+
"identifier": "http://uri.interlex.org/base/ilx_0739400",
6363
}
64-
]
65-
}
64+
],
65+
},
6666
}
6767

6868

6969
def test_google_dataset_metadata_basic_transformation(sample_dandiset_metadata):
7070
"""Test that the basic transformation works correctly"""
7171
result = google_dataset_metadata(sample_dandiset_metadata)
72-
72+
7373
# Check that the original metadata is not modified
7474
assert sample_dandiset_metadata != result
75-
75+
7676
# Check that schema:Dataset is added to schemaKey
7777
assert "schema:Dataset" in result["schemaKey"]
7878

@@ -85,19 +85,19 @@ def test_google_dataset_metadata_basic_transformation(sample_dandiset_metadata):
8585
creator = result["schema:creator"][0]
8686
assert creator["schemaKey"] == "schema:Person"
8787
assert "name" in creator
88-
88+
8989
# Check that license is properly formatted
9090
assert "license" in result
9191
assert isinstance(result["license"], list)
9292
assert "https://spdx.org/licenses/CC-BY-4.0" in result["license"]
93-
93+
9494
# Check that version is present
9595
assert "version" in result
96-
96+
9797
# Check that identifier is properly formatted
9898
assert "identifier" in result
9999
assert result["identifier"] == "https://identifiers.org/DANDI:000707"
100-
100+
101101
# Check that keywords exist
102102
assert "keywords" in result
103103
assert isinstance(result["keywords"], list)
@@ -110,7 +110,7 @@ def test_google_dataset_metadata_preserves_original(sample_dandiset_metadata):
110110
"""Test that the original metadata is not modified"""
111111
original = copy.deepcopy(sample_dandiset_metadata)
112112
google_dataset_metadata(sample_dandiset_metadata)
113-
113+
114114
# Verify the original is unchanged
115115
assert original == sample_dandiset_metadata
116116

@@ -122,12 +122,12 @@ def test_google_dataset_metadata_with_existing_creator(sample_dandiset_metadata)
122122
{
123123
"schemaKey": "Person",
124124
"name": "Jane Smith",
125-
"identifier": "https://orcid.org/0000-0002-3456-7890"
125+
"identifier": "https://orcid.org/0000-0002-3456-7890",
126126
}
127127
]
128-
128+
129129
result = google_dataset_metadata(sample_dandiset_metadata)
130-
130+
131131
# Check that the existing creator is preserved
132132
assert result["creator"] == sample_dandiset_metadata["creator"]
133133

@@ -136,13 +136,13 @@ def test_google_dataset_metadata_with_existing_keywords(sample_dandiset_metadata
136136
"""Test that existing keywords are preserved and extended"""
137137
# Add keywords field
138138
sample_dandiset_metadata["keywords"] = ["test", "example"]
139-
139+
140140
result = google_dataset_metadata(sample_dandiset_metadata)
141-
141+
142142
# Check that the existing keywords are preserved
143143
assert "test" in result["keywords"]
144144
assert "example" in result["keywords"]
145-
145+
146146
# Check that additional keywords are added
147147
assert "neuroscience" in result["keywords"]
148148
assert "DANDI" in result["keywords"]
@@ -153,9 +153,9 @@ def test_google_dataset_metadata_with_no_license(sample_dandiset_metadata):
153153
# Remove license field
154154
no_license_metadata = copy.deepcopy(sample_dandiset_metadata)
155155
del no_license_metadata["license"]
156-
156+
157157
result = google_dataset_metadata(no_license_metadata)
158-
158+
159159
# Check that license is not in the result
160160
assert "license" not in result
161161

@@ -165,7 +165,7 @@ def test_google_dataset_metadata_with_no_contributors(sample_dandiset_metadata):
165165
# Remove contributor field
166166
no_contributor_metadata = copy.deepcopy(sample_dandiset_metadata)
167167
del no_contributor_metadata["contributor"]
168-
168+
169169
result = google_dataset_metadata(no_contributor_metadata)
170170

171171
# Check that schema:creator is not in the result
@@ -176,9 +176,9 @@ def test_google_dataset_metadata_with_date_published(sample_dandiset_metadata):
176176
"""Test handling of datePublished field"""
177177
# Add datePublished field
178178
sample_dandiset_metadata["datePublished"] = "2023-01-01T00:00:00Z"
179-
179+
180180
result = google_dataset_metadata(sample_dandiset_metadata)
181-
181+
182182
# Check that datePublished is preserved
183183
assert result["datePublished"] == "2023-01-01T00:00:00Z"
184184

@@ -187,8 +187,8 @@ def test_google_dataset_metadata_with_date_created_fallback(sample_dandiset_meta
187187
"""Test fallback to dateCreated when datePublished is not present"""
188188
# Add dateCreated field
189189
sample_dandiset_metadata["dateCreated"] = "2022-01-01T00:00:00Z"
190-
190+
191191
result = google_dataset_metadata(sample_dandiset_metadata)
192-
192+
193193
# Check that datePublished is set to dateCreated
194194
assert result["datePublished"] == "2022-01-01T00:00:00Z"

dandischema/utils.py

Lines changed: 37 additions & 32 deletions
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,8 @@
11
from __future__ import annotations
22

3+
import copy
34
import re
45
from typing import Any, Dict, Iterator, List, Union, cast, get_args, get_origin
5-
import copy
66

77
from jsonschema import Draft7Validator, Draft202012Validator
88
from jsonschema.protocols import Validator as JsonschemaValidator
@@ -247,10 +247,10 @@ def validate_json(instance: Any, validator: JsonschemaValidator) -> None:
247247
def google_dataset_metadata(metadata: Dict[str, Any]) -> Dict[str, Any]:
248248
"""
249249
Transform DANDI metadata to be compatible with Google Dataset Search.
250-
250+
251251
This function takes a DANDI metadata JSON-LD document and transforms it to ensure
252252
it passes the Google Dataset Search validator by adding or modifying required fields.
253-
253+
254254
Required properties for Google Dataset Search:
255255
- @type: Dataset
256256
- name: The name of the dataset
@@ -260,20 +260,20 @@ def google_dataset_metadata(metadata: Dict[str, Any]) -> Dict[str, Any]:
260260
- version: The version of the dataset
261261
- identifier: An identifier for the dataset (preferably a DOI)
262262
- keywords: Keywords describing the dataset
263-
263+
264264
Parameters
265265
----------
266266
metadata : Dict[str, Any]
267267
The original DANDI metadata JSON-LD document
268-
268+
269269
Returns
270270
-------
271271
Dict[str, Any]
272272
The transformed metadata that is compatible with Google Dataset Search
273273
"""
274274
# Make a deep copy to avoid modifying the original
275275
result = copy.deepcopy(metadata)
276-
276+
277277
# Append schema:Dataset to schemaKey
278278
if "schemaKey" in result:
279279
# If schemaKey is a string, convert it to a list
@@ -294,26 +294,31 @@ def google_dataset_metadata(metadata: Dict[str, Any]) -> Dict[str, Any]:
294294
if "schema:creator" not in result and "contributor" in result:
295295
# Filter contributors with Author role
296296
authors = [
297-
contrib for contrib in result["contributor"]
297+
contrib
298+
for contrib in result["contributor"]
298299
if contrib.get("roleName") and "dcite:Author" in contrib.get("roleName", [])
299300
]
300-
301+
301302
# If no authors found, use all contributors
302303
creators = authors if authors else result["contributor"]
303-
304+
304305
# Format creators according to schema.org requirements
305306
result["schema:creator"] = []
306307
for person in creators:
307308
# Create a new creator object with updated schemaKey
308309
creator = {
309-
"schemaKey": "schema:Organization" if person.get("schemaKey") == "Organization" else "schema:Person",
310-
"name": person.get("name", "")
310+
"schemaKey": (
311+
"schema:Organization"
312+
if person.get("schemaKey") == "Organization"
313+
else "schema:Person"
314+
),
315+
"name": person.get("name", ""),
311316
}
312-
317+
313318
# Add identifier if available (ORCID for Person, ROR for Organization)
314319
if person.get("identifier"):
315320
creator["identifier"] = person["identifier"]
316-
321+
317322
result["schema:creator"].append(creator)
318323

319324
# Update contributor schemaKey and remove roleName
@@ -322,22 +327,22 @@ def google_dataset_metadata(metadata: Dict[str, Any]) -> Dict[str, Any]:
322327
for contributor in result["contributor"]:
323328
# Make a copy of the contributor
324329
updated_contributor = copy.deepcopy(contributor)
325-
330+
326331
# Update schemaKey if it exists
327332
if "schemaKey" in updated_contributor:
328333
if updated_contributor["schemaKey"] == "Person":
329334
updated_contributor["schemaKey"] = "schema:Person"
330335
elif updated_contributor["schemaKey"] == "Organization":
331336
updated_contributor["schemaKey"] = "schema:Organization"
332-
337+
333338
# Remove roleName if it exists
334339
if "roleName" in updated_contributor:
335340
del updated_contributor["roleName"]
336-
341+
337342
updated_contributors.append(updated_contributor)
338-
343+
339344
result["contributor"] = updated_contributors
340-
345+
341346
# Ensure license is properly formatted for schema.org
342347
if "license" in result:
343348
# Transform DANDI license format to schema.org format
@@ -349,62 +354,62 @@ def google_dataset_metadata(metadata: Dict[str, Any]) -> Dict[str, Any]:
349354
schema_licenses.append(f"https://spdx.org/licenses/{license_id}")
350355
else:
351356
schema_licenses.append(license_type)
352-
357+
353358
result["license"] = schema_licenses
354-
359+
355360
# Ensure version is present
356361
if "schemaVersion" in result and "version" not in result:
357362
result["version"] = result["schemaVersion"]
358-
363+
359364
# Ensure identifier is properly formatted (preferably as a DOI URL)
360365
if "identifier" in result and isinstance(result["identifier"], str):
361366
# If it's a DOI in the format "DANDI:123456", convert to a URL
362367
if result["identifier"].startswith("DANDI:"):
363368
dandiset_id = result["identifier"].replace("DANDI:", "")
364369
result["identifier"] = f"https://identifiers.org/DANDI:{dandiset_id}"
365-
370+
366371
# Generate keywords based on available metadata
367372
keywords = []
368-
373+
369374
# Add data standard as keywords
370375
if "assetsSummary" in result and "dataStandard" in result["assetsSummary"]:
371376
for std in result["assetsSummary"]["dataStandard"]:
372377
if "name" in std:
373378
keywords.append(std["name"])
374-
379+
375380
# Add species as keywords
376381
if "assetsSummary" in result and "species" in result["assetsSummary"]:
377382
for species in result["assetsSummary"]["species"]:
378383
if "name" in species:
379384
keywords.append(species["name"])
380-
385+
381386
# Add approach as keywords
382387
if "assetsSummary" in result and "approach" in result["assetsSummary"]:
383388
for approach in result["assetsSummary"]["approach"]:
384389
if "name" in approach:
385390
keywords.append(approach["name"])
386-
391+
387392
# Transform measurement technique into a list of strings and add as keywords
388393
if "assetsSummary" in result and "measurementTechnique" in result["assetsSummary"]:
389394
# Extract technique names for keywords
390395
for technique in result["assetsSummary"]["measurementTechnique"]:
391396
if "name" in technique:
392397
keywords.append(technique["name"])
393-
398+
394399
# Transform the measurementTechnique to a list of strings (names only)
395400
technique_names = []
396401
for technique in result["assetsSummary"]["measurementTechnique"]:
397402
if "name" in technique:
398403
technique_names.append(technique["name"])
399-
404+
400405
# Replace the original complex objects with just the names
401406
if technique_names:
402407
result["assetsSummary"]["measurementTechnique"] = technique_names
403-
408+
404409
# Add "neuroscience" as a default keyword for DANDI
405410
keywords.append("neuroscience")
406411
keywords.append("DANDI")
407-
412+
408413
# Add keywords to result if we generated any
409414
if keywords:
410415
if "keywords" not in result or not result["keywords"]:
@@ -416,13 +421,13 @@ def google_dataset_metadata(metadata: Dict[str, Any]) -> Dict[str, Any]:
416421
if keyword not in existing_keywords:
417422
existing_keywords.append(keyword)
418423
result["keywords"] = existing_keywords
419-
424+
420425
# Add datePublished if available
421426
if "datePublished" in result:
422427
# Ensure it's in the proper format
423428
result["datePublished"] = result["datePublished"]
424429
elif "dateCreated" in result:
425430
# Use dateCreated as a fallback
426431
result["datePublished"] = result["dateCreated"]
427-
432+
428433
return result

0 commit comments

Comments
 (0)