try:
    # Python 3
    from urllib.request import Request, urlopen
except ImportError:
    # Python 2 fallback
    from urllib2 import Request, urlopen
import json

# Note: 'x' (the input metadata dict) and 'res' (the output) are assumed to be
# provided and collected by the surrounding exporter environment.

# Function to fetch the file list for the latest dataset version
def get_dataset_files(site_url, dataset_id):
    """Fetch the list of files in the latest dataset version via the native API"""
    try:
        url = site_url + '/api/datasets/' + str(dataset_id) + '/versions/:latest'
        request = Request(url)
        request.add_header('Accept', 'application/json')
        response = urlopen(request, timeout=30)
        data = json.loads(response.read())

        if data.get('status') == 'OK' and 'data' in data:
            return data['data'].get('files', [])
        return []
    except Exception:
        # Treat network or parse failures as "no files found"
        return []

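# For reference, the native API response is assumed to look roughly like
# (field values hypothetical):
#   {"status": "OK",
#    "data": {"files": [{"label": "data.tab",
#                        "dataFile": {"id": 42, "filename": "data.tab",
#                                     "contentType": "text/tab-separated-values",
#                                     "createDate": "2024-01-01"}}]}}
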
# Function to download CDI file content
def get_cdi_file_content(site_url, file_id):
    """Download the content of a CDI file and parse it as JSON"""
    try:
        url = site_url + '/api/access/datafile/' + str(file_id)
        request = Request(url)
        response = urlopen(request, timeout=30)
        content = response.read()
        # Parsing as JSON doubles as validation of the downloaded content
        parsed = json.loads(content)
        return parsed
    except Exception:
        return None

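# Example (hypothetical host and file id):
#   content = get_cdi_file_content('https://demo.dataverse.org', 42)
#   -> parsed JSON-LD dict on success, None on any download/parse failure
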
# Try to find and return an existing CDI file
def find_cdi_file():
    """Find the latest CDI file with the application/ld+json MIME type"""

    # Extract the site URL and dataset ID from the input data
    site_url = None
    dataset_id = None

    # Try to get the site URL from the ORE export
    if 'datasetORE' in x and 'ore:describes' in x['datasetORE']:
        describes = x['datasetORE']['ore:describes']
        if '@id' in describes:
            dataset_url = describes['@id']
            # Extract the base URL (everything before /dataset.xhtml or /citation)
            if '/dataset.xhtml' in dataset_url:
                site_url = dataset_url.split('/dataset.xhtml')[0]
            elif '/citation' in dataset_url:
                site_url = dataset_url.split('/citation')[0]
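            # Example (hypothetical): an @id of
            # 'https://demo.dataverse.org/dataset.xhtml?persistentId=doi:10.5072/FK2/ABC123'
            # yields the site_url 'https://demo.dataverse.org'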

    # Try to get the dataset ID from datasetJson
    if 'datasetJson' in x and 'id' in x['datasetJson']:
        dataset_id = x['datasetJson']['id']

    if not site_url or not dataset_id:
        return None

    # Get the list of files
    files = get_dataset_files(site_url, dataset_id)

    # Find CDI files (application/ld+json with the DDI-CDI profile MIME type)
    cdi_files = []
    for file_info in files:
        datafile = file_info.get('dataFile', {})
        content_type = datafile.get('contentType', '')

        # Check for application/ld+json with the DDI-CDI profile.
        # Accept both the full profile and plain application/ld+json for now (transition period).
        is_cdi_mime = (
            'ddialliance.org/Specification/DDI-CDI' in content_type or
            (content_type == 'application/ld+json' and datafile.get('filename', '').endswith('.jsonld'))
        )

        if is_cdi_mime:
            # Require the .jsonld extension
            filename = datafile.get('filename', '')
            if filename.endswith('.jsonld'):
                cdi_files.append({
                    'id': datafile.get('id'),
                    'filename': filename,
                    'createDate': datafile.get('createDate', '')
                })

    # Sort by creation date, newest first (ISO timestamps sort correctly as strings)
    if cdi_files:
        cdi_files.sort(key=lambda f: f.get('createDate', ''), reverse=True)
        latest_file = cdi_files[0]

        # Download and return the content
        content = get_cdi_file_content(site_url, latest_file['id'])
        if content:
            return content

    return None

# Try to get an existing CDI file first
existing_cdi = find_cdi_file()
if existing_cdi:
    res = existing_cdi
else:
    # Generate CDI JSON-LD from the dataset metadata
    res = {}

    # Set up the JSON-LD context
    context = {}
    context['@vocab'] = 'https://ddialliance.org/Specification/DDI-CDI/1.0/RDF/'
    context['ddi'] = 'https://ddialliance.org/Specification/DDI-CDI/1.0/RDF/'
    context['xsd'] = 'http://www.w3.org/2001/XMLSchema#'
    context['rdf'] = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
    context['rdfs'] = 'http://www.w3.org/2000/01/rdf-schema#'
    context['skos'] = 'http://www.w3.org/2004/02/skos/core#'
    res['@context'] = context

    # Use @graph for a flattened JSON-LD structure
    graph = []

    # Get the basic dataset information
    dataset_json = x.get('datasetJson', {})
    schema_org = x.get('datasetSchemaDotOrg', {})
    ore_data = x.get('datasetORE', {})

    # Create the Dataset Description
    dataset_description = {}
    dataset_description['@type'] = 'DatasetDescription'

    # Generate the dataset ID
    dataset_id = dataset_json.get('identifier')
    if dataset_id:
        dataset_description['@id'] = 'dataset-' + str(dataset_id)
    else:
        dataset_description['@id'] = 'dataset-1'

    # Add the title
    if 'name' in schema_org:
        dataset_description['name'] = schema_org['name']
    elif 'datasetVersion' in dataset_json:
        metadata = dataset_json['datasetVersion'].get('metadataBlocks', {})
        if 'citation' in metadata and 'fields' in metadata['citation']:
            for field in metadata['citation']['fields']:
                if field.get('typeName') == 'title':
                    dataset_description['name'] = field.get('value', '')
                    break

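    # A field in the citation block is assumed to look roughly like
    # (values hypothetical):
    #   {"typeName": "title", "multiple": false, "typeClass": "primitive",
    #    "value": "Example Dataset Title"}
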
    # Add the description
    if 'description' in schema_org:
        if isinstance(schema_org['description'], list) and len(schema_org['description']) > 0:
            dataset_description['description'] = schema_org['description'][0]
        elif isinstance(schema_org['description'], str):
            dataset_description['description'] = schema_org['description']

    # Add the identifier (DOI/Handle)
    if 'identifier' in schema_org:
        dataset_description['identifier'] = schema_org['identifier']
    elif 'ore:describes' in ore_data:
        describes = ore_data['ore:describes']
        if '@id' in describes:
            dataset_description['identifier'] = describes['@id']

    # Add the publication date
    if 'datePublished' in schema_org:
        dataset_description['datePublished'] = schema_org['datePublished']

    # Add creators/authors
    creators = []
    if 'creator' in schema_org:
        for creator_obj in schema_org['creator']:
            creator = {}
            creator['@type'] = 'Individual'
            if 'name' in creator_obj:
                creator['name'] = creator_obj['name']
            if '@id' in creator_obj:
                creator['@id'] = creator_obj['@id']
            elif 'name' in creator_obj:
                # Generate an ID from the name
                creator['@id'] = 'creator-' + creator_obj['name'].replace(' ', '-').lower()

            # Add the affiliation if available
            if 'affiliation' in creator_obj:
                affiliation = {}
                affiliation['@type'] = 'Organization'
                if isinstance(creator_obj['affiliation'], dict):
                    affiliation['name'] = creator_obj['affiliation'].get('name', '')
                else:
                    affiliation['name'] = creator_obj['affiliation']
                creator['affiliation'] = affiliation

            creators.append(creator)

    if creators:
        dataset_description['creators'] = creators

    # Add keywords/subjects
    if 'keywords' in schema_org and schema_org['keywords']:
        dataset_description['keywords'] = schema_org['keywords']

    # Add the license
    if 'license' in schema_org:
        license_info = schema_org['license']
        if isinstance(license_info, dict):
            dataset_description['license'] = license_info.get('url') or license_info.get('@id') or license_info.get('name')
        else:
            dataset_description['license'] = license_info

    # Add the publisher
    if 'publisher' in schema_org:
        publisher = {}
        publisher['@type'] = 'Organization'
        if isinstance(schema_org['publisher'], dict):
            publisher['name'] = schema_org['publisher'].get('name', '')
            if 'url' in schema_org['publisher']:
                publisher['url'] = schema_org['publisher']['url']
        else:
            publisher['name'] = schema_org['publisher']
        dataset_description['publisher'] = publisher

    graph.append(dataset_description)

    # Process data files and create DataStore entries
    file_details = x.get('datasetFileDetails', [])
    if file_details:
        for idx, file_info in enumerate(file_details):
            # Create a DataStore for each file
            datastore = {}
            datastore['@type'] = 'DataStore'
            datastore['@id'] = 'datastore-' + str(idx + 1)

            # Get the filename
            filename = file_info.get('originalFileName') or file_info.get('filename', '')
            datastore['name'] = filename

            # Add the description if available
            if 'description' in file_info:
                datastore['description'] = file_info['description']

            # Add the file format
            file_format = file_info.get('originalFileFormat') or file_info.get('contentType', '')
            if file_format:
                datastore['format'] = file_format

            # Add the file size
            file_size = file_info.get('originalFileSize') or file_info.get('filesize')
            if file_size:
                datastore['size'] = str(file_size)

            # Add the checksum
            if 'checksum' in file_info:
                checksum = file_info['checksum']
                datastore['checksum'] = {
                    'algorithm': checksum.get('type', ''),
                    'value': checksum.get('value', '')
                }
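            # Dataverse file details typically report the checksum as
            # {"type": "MD5", "value": "<hex digest>"}; the algorithm name
            # depends on the installation's configuration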

            graph.append(datastore)

            # Process variables if available
            data_tables = file_info.get('dataTables', [])
            for table_idx, data_table in enumerate(data_tables):
                data_variables = data_table.get('dataVariables', [])

                for var_idx, var_info in enumerate(data_variables):
                    # Create a Variable
                    variable = {}
                    variable['@type'] = 'Variable'
                    variable['@id'] = 'variable-' + str(var_info.get('id', str(idx) + '-' + str(var_idx)))
                    variable['name'] = var_info.get('name', '')

                    # Add the label (description)
                    if 'label' in var_info:
                        variable['label'] = var_info['label']

                    # Map the variable type information to a data type
                    var_format = var_info.get('variableFormatType', '')
                    var_interval = var_info.get('variableIntervalType', '')

                    if var_format == 'CHARACTER':
                        variable['dataType'] = 'string'
                    elif var_format == 'NUMERIC':
                        if var_interval == 'discrete':
                            variable['dataType'] = 'integer'
                        elif var_interval == 'contin':
                            # 'contin' is Dataverse's value for continuous variables
                            variable['dataType'] = 'float'
                        else:
                            variable['dataType'] = 'numeric'

                    # Add the UNF fingerprint if available
                    if 'unf' in var_info:
                        variable['fingerprint'] = var_info['unf']

                    # Link the variable to its datastore
                    variable['sourceDataStore'] = {'@id': 'datastore-' + str(idx + 1)}

                    graph.append(variable)

    # Add a DataSet entry that ties everything together
    dataset = {}
    dataset['@type'] = 'DataSet'
    dataset['@id'] = 'dataset'
    dataset['describes'] = {'@id': dataset_description['@id']}

    # Link to the data stores
    datastore_refs = [{'@id': item['@id']} for item in graph if item.get('@type') == 'DataStore']
    if datastore_refs:
        dataset['hasDataStores'] = datastore_refs

    graph.append(dataset)

    # Set the graph
    res['@graph'] = graph
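
    # For illustration, the generated document has roughly this shape
    # (ids and values hypothetical):
    #   {"@context": {...},
    #    "@graph": [
    #      {"@type": "DatasetDescription", "@id": "dataset-FK2/ABC123", ...},
    #      {"@type": "DataStore", "@id": "datastore-1", "name": "data.tab", ...},
    #      {"@type": "Variable", "@id": "variable-17", "sourceDataStore": {"@id": "datastore-1"}, ...},
    #      {"@type": "DataSet", "@id": "dataset", "describes": {"@id": "dataset-FK2/ABC123"}, ...}
    #    ]}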