Skip to content

Commit a609970

Browse files
committed
Add initial implementation of py_croissant configuration and transformer functions
1 parent 0f7175a commit a609970

File tree

2 files changed

+186
-0
lines changed

2 files changed

+186
-0
lines changed

examples/py_croissant/config.json

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
{
2+
"formatName": "py_croissant",
3+
"harvestable": false,
4+
"availableToUsers": true,
5+
"mediaType": "application/json",
6+
"displayName": "Croissant py"
7+
}
Lines changed: 179 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,179 @@
1+
def get_bibtex():
    """Return a single-line BibTeX ``@data`` entry for the dataset.

    All values are read from the module-level export input ``x``:

    * ``x["datasetJson"]["identifier"]`` -- first part of the citation key
    * ``x["datasetORE"]["ore:describes"]`` -- ``schema:datePublished``
      (its first four characters are used as the year) and ``@id`` (the URL)
    * ``x["datasetSchemaDotOrg"]`` -- ``creator`` list, ``publisher`` name
      and dataset ``name`` (the title)

    Creator names are joined with ``" and "``; an empty creator list yields
    an empty ``author`` field. Fields are separated by commas only (no
    spaces or newlines).

    Raises:
        KeyError: if any of the keys listed above is missing.
    """
    identifier = x["datasetJson"]["identifier"]
    ore_describes = x["datasetORE"]["ore:describes"]
    publication_year = ore_describes["schema:datePublished"][0:4]

    schema_org = x["datasetSchemaDotOrg"]
    creators_formatted = " and ".join(
        [creator["name"] for creator in schema_org["creator"]]
    )

    publisher = schema_org["publisher"]["name"]
    title = schema_org["name"]
    pid_as_url = ore_describes["@id"]

    fields = [
        "author = {" + creators_formatted + "}",
        "publisher = {" + publisher + "}",
        "title = {" + title + "}",
        "year = {" + publication_year + "}",
        "url = {" + pid_as_url + "}",
    ]
    # Citation key is "<identifier>_<year>"; the trailing comma after the key
    # and the commas between fields come from the join below.
    return "@data{" + identifier + "_" + publication_year + "," + ",".join(fields) + "}"
24+
25+
26+
def get_numeric_type(variable_interval_type):
    """Map a variable interval type to the data type string used for a field.

    ``"discrete"`` maps to ``"sc:Integer"`` and ``"contin"`` maps to
    ``"sc:Float"``; every other value (including ``None``) maps to
    ``"sc:Text"``.
    """
    known_types = (
        ("discrete", "sc:Integer"),
        ("contin", "sc:Float"),
    )
    for interval_label, mapped_type in known_types:
        if variable_interval_type == interval_label:
            return mapped_type
    # Anything unrecognised is treated as text.
    return "sc:Text"
32+
33+
34+
# Term and prefix mappings stored under res["@context"]. Key order is kept
# exactly as listed because it determines the order of the emitted keys.
context = {
    "@language": "en",
    "@vocab": "https://schema.org/",
    "citeAs": "cr:citeAs",
    "column": "cr:column",
    "conformsTo": "dct:conformsTo",
    "cr": "http://mlcommons.org/croissant/",
    "rai": "http://mlcommons.org/croissant/RAI/",
    "data": {"@id": "cr:data", "@type": "@json"},
    "dataType": {"@id": "cr:dataType", "@type": "@vocab"},
    "dct": "http://purl.org/dc/terms/",
    "examples": {"@id": "cr:examples", "@type": "@json"},
    "extract": "cr:extract",
    "field": "cr:field",
    "fileProperty": "cr:fileProperty",
    "fileObject": "cr:fileObject",
    "fileSet": "cr:fileSet",
    "format": "cr:format",
    "includes": "cr:includes",
    "isLiveDataset": "cr:isLiveDataset",
    "jsonPath": "cr:jsonPath",
    "key": "cr:key",
    "md5": "cr:md5",
    "parentField": "cr:parentField",
    "path": "cr:path",
    "recordSet": "cr:recordSet",
    "references": "cr:references",
    "regex": "cr:regex",
    "repeated": "cr:repeated",
    "replace": "cr:replace",
    "sc": "https://schema.org/",
    "separator": "cr:separator",
    "source": "cr:source",
    "subField": "cr:subField",
    "transform": "cr:transform",
    "wd": "https://www.wikidata.org/wiki/",
}

# `res` is the output document; the statements that follow add to it.
res = {"@context": context}
73+
74+
# Dataset-level properties of the output document.
res["@type"] = "sc:Dataset"
res["conformsTo"] = "http://mlcommons.org/croissant/1.0"

# Title, URL and version come from the ORE description of the dataset.
describes = x["datasetORE"]["ore:describes"]
res["name"] = describes["title"]
res["url"] = describes["@id"]

# These properties are copied unchanged, under the same key, from the
# schema.org export of the dataset. All of them are required (KeyError if
# one is missing).
schema_org = x["datasetSchemaDotOrg"]
for copied_key in (
    "creator",
    "description",
    "keywords",
    "license",
    "datePublished",
    "dateModified",
    "includedInDataCatalog",
    "publisher",
):
    res[copied_key] = schema_org[copied_key]

res["version"] = describes["schema:version"]
res["citeAs"] = get_bibtex()

# Optional properties: only emitted when present and non-empty.
funder = schema_org.get("funder")
if funder:
    res["funder"] = funder

spatial_coverage = schema_org.get("spatialCoverage")
if spatial_coverage:
    res["spatialCoverage"] = spatial_coverage
98+
99+
# Build one "cr:FileObject" entry per dataset file and, for files that carry
# data tables, one "cr:RecordSet" per table with a "cr:Field" per variable.
ore_files = describes["ore:aggregates"]
distribution = []
record_set = []

for i, file_details in enumerate(x["datasetFileDetails"]):
    # Prefer the "original*" properties when they are present and truthy,
    # otherwise fall back to the plain file properties.
    filename = file_details.get("originalFileName") or file_details["filename"]
    file_format = file_details.get("originalFileFormat") or file_details["contentType"]
    file_size = file_details.get("originalFileSize") or file_details["filesize"]

    checksum = file_details["checksum"]
    # The lower-cased checksum type (e.g. "md5") is used as the property name.
    checksum_type = checksum["type"].lower()
    checksum_value = checksum["value"]

    # NOTE(review): the ORE aggregate is looked up by position, which assumes
    # "ore:aggregates" lists the files in the same order as
    # "datasetFileDetails" -- confirm against the exporter input.
    ore_file = ore_files[i]

    # The file id is the file name, prefixed by its directory label if any.
    file_id = filename
    directory_label = ore_file.get("dvcore:directoryLabel")
    if directory_label:
        file_id = directory_label + "/" + filename

    dist = {
        "@type": "cr:FileObject",
        "@id": file_id,
        "name": filename,
        "encodingFormat": file_format,
        checksum_type: checksum_value,
        "contentSize": str(file_size),
        "description": file_details.get("description", ""),
        "contentUrl": ore_file["schema:sameAs"],
    }
    distribution.append(dist)

    # A missing or empty "dataTables" entry means there is nothing to describe.
    data_tables = file_details.get("dataTables") or []

    for data_table_object in data_tables:
        field_set_array = []

        for data_variable_object in data_table_object["dataVariables"]:
            variable_id = str(data_variable_object["id"])
            variable_format_type = data_variable_object["variableFormatType"]
            variable_interval_type = data_variable_object["variableIntervalType"]

            # NOTE(review): any format type other than CHARACTER or NUMERIC
            # leaves dataType as None in the output -- confirm this is intended.
            data_type = None
            if variable_format_type == "CHARACTER":
                data_type = "sc:Text"
            elif variable_format_type == "NUMERIC":
                data_type = get_numeric_type(variable_interval_type)

            field_set = {
                "@type": "cr:Field",
                "name": data_variable_object["name"],
                "description": data_variable_object["label"],
                "dataType": data_type,
                # Ties the field back to the file object it is read from.
                "source": {"@id": variable_id, "fileObject": {"@id": file_id}},
            }
            field_set_array.append(field_set)

        record_set_content = {"@type": "cr:RecordSet", "field": field_set_array}
        record_set.append(record_set_content)
166+
167+
# Optional dataset properties: copied from the schema.org export only when
# present and non-empty.
citation = x["datasetSchemaDotOrg"].get("citation")
if citation:
    res["citation"] = citation

temporal_coverage = x["datasetSchemaDotOrg"].get("temporalCoverage")
if temporal_coverage:
    res["temporalCoverage"] = temporal_coverage

# File objects and record sets are only emitted when at least one was built.
if distribution:
    res["distribution"] = distribution

if record_set:
    res["recordSet"] = record_set

0 commit comments

Comments
 (0)