
Commit d540480

Add DDI-CDI metadata export functionality and configuration files

1 parent 75a1a52 commit d540480

File tree

3 files changed: +331 -0 lines changed

README.md

Lines changed: 14 additions & 0 deletions
@@ -110,6 +110,20 @@ This exporter is entirely based on the [DDI PDF Exporter](https://github.com/gdc

This exporter is entirely based on the [Dataverse PR 10086](https://github.com/IQSS/dataverse/pull/10086). It is simply a port of that exporter into Python (Jython).

### DDI-CDI (Cross Domain Integration)

This exporter provides DDI-CDI metadata export in two modes (see the sketch after this list):

1. **Primary mode**: finds and exports the latest `.jsonld` file in the dataset whose MIME type carries the DDI-CDI profile (`application/ld+json; profile="..."` referencing the DDI-CDI specification). This preserves any manually created or uploaded CDI metadata files.
2. **Fallback mode**: when no CDI file exists, generates DDI-CDI JSON-LD from the dataset metadata, including:
   - Dataset description with title, creators, keywords, and license
   - DataStore entries for each file, with checksums
   - Variable definitions with appropriate data types
   - DDI-CDI ontology structure with `@graph` and typed entities
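The choice between the two modes reduces to a single check. A minimal sketch of the control flow (the full script below inlines the generation step; `generate_cdi_from_metadata` is a hypothetical stand-in for it):

```python
# Mode selection as implemented in the exporter script below.
# generate_cdi_from_metadata is a hypothetical stand-in for the
# inline generation code; 'res' is the exporter's output.
existing_cdi = find_cdi_file()            # primary mode: reuse an uploaded CDI .jsonld file
if existing_cdi:
    res = existing_cdi
else:
    res = generate_cdi_from_metadata()    # fallback mode: build JSON-LD from metadata
```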
The exporter is written in Python (Jython) and makes public API calls to check for existing CDI files before falling back to generation. It relies on the DDI-CDI profile in the MIME type to distinguish CDI files from other JSON-LD files, so datasets with curated CDI metadata keep it, while all others get automatically generated metadata.

**MIME Type:** `application/ld+json; profile="http://www.w3.org/ns/json-ld#flattened http://www.w3.org/ns/json-ld#compacted https://ddialliance.org/Specification/DDI-CDI/1.0"`
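Once the exporter is registered under the `cdi` format name (see `config.json` below), the export should be retrievable through Dataverse's standard dataset export API. A minimal sketch in the same Python 2/Jython dialect as the exporter; the site URL and DOI are placeholders:

```python
import urllib2

# Placeholders: substitute your installation and a published dataset's DOI.
SITE_URL = 'https://demo.dataverse.org'
PID = 'doi:10.5072/FK2/EXAMPLE'

# Dataverse's dataset export endpoint, parameterized by exporter name.
url = SITE_URL + '/api/datasets/export?exporter=cdi&persistentId=' + PID
response = urllib2.urlopen(url, timeout=30)
print(response.read())
```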
## Developer guide

The easiest way to start is to write JavaScript code. You can use the provided [Croissant](/examples/croissant/js/croissant.js) code as a starting point. You will need to restart the server after changing that code. Note that the exporters use caching: to see your changes, either wait until the cache expires or delete the cached exporter output manually.

examples/cdi-exporter/config.json

Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
{
  "formatName": "cdi",
  "harvestable": true,
  "availableToUsers": true,
  "mediaType": "application/ld+json; profile=\"http://www.w3.org/ns/json-ld#flattened http://www.w3.org/ns/json-ld#compacted https://ddialliance.org/Specification/DDI-CDI/1.0\"",
  "displayName": "DDI-CDI (Cross Domain Integration)"
}
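The `mediaType` above doubles as the marker the exporter looks for on uploaded files. A minimal sketch of that check, mirroring the `is_cdi_mime` test in the script below:

```python
def has_ddi_cdi_profile(content_type):
    # A content type counts as DDI-CDI when it is JSON-LD and its
    # profile parameter references the DDI-CDI specification.
    return ('application/ld+json' in content_type and
            'ddialliance.org/Specification/DDI-CDI' in content_type)

print(has_ddi_cdi_profile('application/ld+json; profile="https://ddialliance.org/Specification/DDI-CDI/1.0"'))  # True
print(has_ddi_cdi_profile('application/json'))  # False
```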
Lines changed: 310 additions & 0 deletions
@@ -0,0 +1,310 @@
import urllib2
import json

# 'x' (provided by the exporter runtime) holds the dataset metadata;
# the export result must be left in 'res'.

# Function to fetch file list from dataset
def get_dataset_files(site_url, dataset_id):
    """Fetch the list of files in the dataset version"""
    try:
        url = site_url + '/api/datasets/' + str(dataset_id) + '/versions/:latest'
        request = urllib2.Request(url)
        request.add_header('Accept', 'application/json')
        response = urllib2.urlopen(request, timeout=30)
        data = json.loads(response.read())

        if data.get('status') == 'OK' and 'data' in data:
            return data['data'].get('files', [])
        return []
    except Exception:
        return []

# Function to download CDI file content
def get_cdi_file_content(site_url, file_id):
    """Download the content of a CDI file"""
    try:
        url = site_url + '/api/access/datafile/' + str(file_id)
        request = urllib2.Request(url)
        response = urllib2.urlopen(request, timeout=30)
        content = response.read()
        # Parse as JSON both to validate and to return structured data
        parsed = json.loads(content)
        return parsed
    except Exception:
        return None

# Try to find and return existing CDI file
def find_cdi_file():
    """Find the latest CDI file (application/ld+json with DDI-CDI profile)"""

    # Extract site URL and dataset ID from the input data
    site_url = None
    dataset_id = None

    # Try to get site URL from ORE export
    if 'datasetORE' in x and 'ore:describes' in x['datasetORE']:
        describes = x['datasetORE']['ore:describes']
        if '@id' in describes:
            dataset_url = describes['@id']
            # Extract base URL (everything before /dataset.xhtml or /citation)
            if '/dataset.xhtml' in dataset_url:
                site_url = dataset_url.split('/dataset.xhtml')[0]
            elif '/citation' in dataset_url:
                site_url = dataset_url.split('/citation')[0]

    # Try to get dataset ID from datasetJson
    if 'datasetJson' in x and 'id' in x['datasetJson']:
        dataset_id = x['datasetJson']['id']

    if not site_url or not dataset_id:
        return None

    # Get list of files
    files = get_dataset_files(site_url, dataset_id)

    # Find CDI files (application/ld+json with DDI-CDI profile MIME type)
    cdi_files = []
    for file_info in files:
        datafile = file_info.get('dataFile', {})
        content_type = datafile.get('contentType', '')

        # Check for application/ld+json with DDI-CDI profile.
        # Accept both the full profile and plain application/ld+json
        # for now (transition period).
        is_cdi_mime = (
            'ddialliance.org/Specification/DDI-CDI' in content_type or
            (content_type == 'application/ld+json' and datafile.get('filename', '').endswith('.jsonld'))
        )

        if is_cdi_mime:
            # Only accept files with a .jsonld extension
            filename = datafile.get('filename', '')
            if filename.endswith('.jsonld'):
                cdi_files.append({
                    'id': datafile.get('id'),
                    'filename': filename,
                    'createDate': datafile.get('createDate', '')
                })

    # Sort by creation date (newest first) and get the most recent
    if cdi_files:
        cdi_files.sort(key=lambda f: f.get('createDate', ''), reverse=True)
        latest_file = cdi_files[0]

        # Download and return the content
        content = get_cdi_file_content(site_url, latest_file['id'])
        if content:
            return content

    return None

# Try to get existing CDI file first
existing_cdi = find_cdi_file()
if existing_cdi:
    res = existing_cdi
else:
    # Generate CDI JSON-LD from dataset metadata
    res = {}

    # Set up JSON-LD context
    context = {}
    context['@vocab'] = 'https://ddialliance.org/Specification/DDI-CDI/1.0/RDF/'
    context['ddi'] = 'https://ddialliance.org/Specification/DDI-CDI/1.0/RDF/'
    context['xsd'] = 'http://www.w3.org/2001/XMLSchema#'
    context['rdf'] = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
    context['rdfs'] = 'http://www.w3.org/2000/01/rdf-schema#'
    context['skos'] = 'http://www.w3.org/2004/02/skos/core#'
    res['@context'] = context

    # Use @graph for flattened JSON-LD structure
    graph = []

    # Get basic dataset information
    dataset_json = x.get('datasetJson', {})
    schema_org = x.get('datasetSchemaDotOrg', {})
    ore_data = x.get('datasetORE', {})

    # Create Dataset Description
    dataset_description = {}
    dataset_description['@type'] = 'DatasetDescription'

    # Generate dataset ID
    dataset_id = dataset_json.get('identifier')
    if dataset_id:
        dataset_description['@id'] = 'dataset-' + str(dataset_id)
    else:
        dataset_description['@id'] = 'dataset-1'

    # Add title
    if 'name' in schema_org:
        dataset_description['name'] = schema_org['name']
    elif 'datasetVersion' in dataset_json:
        metadata = dataset_json['datasetVersion'].get('metadataBlocks', {})
        if 'citation' in metadata and 'fields' in metadata['citation']:
            for field in metadata['citation']['fields']:
                if field.get('typeName') == 'title':
                    dataset_description['name'] = field.get('value', '')
                    break

    # Add description (basestring covers unicode under Jython/Python 2)
    if 'description' in schema_org:
        if isinstance(schema_org['description'], list) and len(schema_org['description']) > 0:
            dataset_description['description'] = schema_org['description'][0]
        elif isinstance(schema_org['description'], basestring):
            dataset_description['description'] = schema_org['description']

    # Add identifier (DOI/Handle)
    if 'identifier' in schema_org:
        dataset_description['identifier'] = schema_org['identifier']
    elif 'ore:describes' in ore_data:
        describes = ore_data['ore:describes']
        if '@id' in describes:
            dataset_description['identifier'] = describes['@id']

    # Add publication date
    if 'datePublished' in schema_org:
        dataset_description['datePublished'] = schema_org['datePublished']

    # Add creators/authors
    creators = []
    if 'creator' in schema_org:
        for creator_obj in schema_org['creator']:
            creator = {}
            creator['@type'] = 'Individual'
            if 'name' in creator_obj:
                creator['name'] = creator_obj['name']
            if '@id' in creator_obj:
                creator['@id'] = creator_obj['@id']
            elif 'name' in creator_obj:
                # Generate ID from name
                creator['@id'] = 'creator-' + creator_obj['name'].replace(' ', '-').lower()

            # Add affiliation if available
            if 'affiliation' in creator_obj:
                affiliation = {}
                affiliation['@type'] = 'Organization'
                if isinstance(creator_obj['affiliation'], dict):
                    affiliation['name'] = creator_obj['affiliation'].get('name', '')
                else:
                    affiliation['name'] = creator_obj['affiliation']
                creator['affiliation'] = affiliation

            creators.append(creator)

    if creators:
        dataset_description['creators'] = creators

    # Add keywords/subjects
    if 'keywords' in schema_org and schema_org['keywords']:
        dataset_description['keywords'] = schema_org['keywords']

    # Add license
    if 'license' in schema_org:
        license_info = schema_org['license']
        if isinstance(license_info, dict):
            dataset_description['license'] = license_info.get('url') or license_info.get('@id') or license_info.get('name')
        else:
            dataset_description['license'] = license_info

    # Add publisher
    if 'publisher' in schema_org:
        publisher = {}
        publisher['@type'] = 'Organization'
        if isinstance(schema_org['publisher'], dict):
            publisher['name'] = schema_org['publisher'].get('name', '')
            if 'url' in schema_org['publisher']:
                publisher['url'] = schema_org['publisher']['url']
        else:
            publisher['name'] = schema_org['publisher']
        dataset_description['publisher'] = publisher

    graph.append(dataset_description)

    # Process data files and create DataStore entries
    file_details = x.get('datasetFileDetails', [])
    if file_details:
        for idx, file_info in enumerate(file_details):
            # Create DataStore for each file
            datastore = {}
            datastore['@type'] = 'DataStore'
            datastore['@id'] = 'datastore-' + str(idx + 1)

            # Get filename
            filename = file_info.get('originalFileName') or file_info.get('filename', '')
            datastore['name'] = filename

            # Add description if available
            if 'description' in file_info:
                datastore['description'] = file_info['description']

            # Add file format
            file_format = file_info.get('originalFileFormat') or file_info.get('contentType', '')
            if file_format:
                datastore['format'] = file_format

            # Add file size
            file_size = file_info.get('originalFileSize') or file_info.get('filesize')
            if file_size:
                datastore['size'] = str(file_size)

            # Add checksum
            if 'checksum' in file_info:
                checksum = file_info['checksum']
                datastore['checksum'] = {
                    'algorithm': checksum.get('type', ''),
                    'value': checksum.get('value', '')
                }

            graph.append(datastore)

            # Process variables if available
            data_tables = file_info.get('dataTables', [])
            for table_idx, data_table in enumerate(data_tables):
                data_variables = data_table.get('dataVariables', [])

                for var_idx, var_info in enumerate(data_variables):
                    # Create Variable
                    variable = {}
                    variable['@type'] = 'Variable'
                    variable['@id'] = 'variable-' + str(var_info.get('id', str(idx) + '-' + str(var_idx)))
                    variable['name'] = var_info.get('name', '')

                    # Add label (description)
                    if 'label' in var_info:
                        variable['label'] = var_info['label']

                    # Map Dataverse format/interval types to simple data types
                    var_format = var_info.get('variableFormatType', '')
                    var_interval = var_info.get('variableIntervalType', '')

                    if var_format == 'CHARACTER':
                        variable['dataType'] = 'string'
                    elif var_format == 'NUMERIC':
                        if var_interval == 'discrete':
                            variable['dataType'] = 'integer'
                        elif var_interval == 'contin':  # Dataverse's value for continuous
                            variable['dataType'] = 'float'
                        else:
                            variable['dataType'] = 'numeric'

                    # Add UNF fingerprint if available
                    if 'unf' in var_info:
                        variable['fingerprint'] = var_info['unf']

                    # Link to datastore
                    variable['sourceDataStore'] = {'@id': 'datastore-' + str(idx + 1)}

                    graph.append(variable)

    # Add DataSet entry that ties everything together
    dataset = {}
    dataset['@type'] = 'DataSet'
    dataset['@id'] = 'dataset'
    dataset['describes'] = {'@id': dataset_description['@id']}

    # Link to data stores
    datastore_refs = [{'@id': item['@id']} for item in graph if item.get('@type') == 'DataStore']
    if datastore_refs:
        dataset['hasDataStores'] = datastore_refs

    graph.append(dataset)

    # Set the graph
    res['@graph'] = graph
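For orientation, an abbreviated sketch of the fallback output this script assembles; all field values are illustrative placeholders, not real dataset content:

```json
{
  "@context": {
    "@vocab": "https://ddialliance.org/Specification/DDI-CDI/1.0/RDF/",
    "ddi": "https://ddialliance.org/Specification/DDI-CDI/1.0/RDF/",
    "xsd": "http://www.w3.org/2001/XMLSchema#",
    "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
    "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
    "skos": "http://www.w3.org/2004/02/skos/core#"
  },
  "@graph": [
    {
      "@type": "DatasetDescription",
      "@id": "dataset-FK2/EXAMPLE",
      "name": "Example Survey",
      "identifier": "https://doi.org/10.5072/FK2/EXAMPLE",
      "creators": [
        { "@type": "Individual", "@id": "creator-jane-doe", "name": "Jane Doe" }
      ]
    },
    {
      "@type": "DataStore",
      "@id": "datastore-1",
      "name": "survey.tab",
      "format": "text/tab-separated-values",
      "checksum": { "algorithm": "MD5", "value": "0cc175b9c0f1b6a831c399e269772661" }
    },
    {
      "@type": "Variable",
      "@id": "variable-42",
      "name": "age",
      "label": "Respondent age",
      "dataType": "integer",
      "sourceDataStore": { "@id": "datastore-1" }
    },
    {
      "@type": "DataSet",
      "@id": "dataset",
      "describes": { "@id": "dataset-FK2/EXAMPLE" },
      "hasDataStores": [ { "@id": "datastore-1" } ]
    }
  ]
}
```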
