Skip to content

Commit c6b8cb1

Browse files
authored
Merge pull request #82 from jyucsiro/test-schema-org
Test schema org
2 parents 49391ff + c07f764 commit c6b8cb1

File tree

4 files changed

+135
-6
lines changed

4 files changed

+135
-6
lines changed

lib/bald/__init__.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -675,8 +675,10 @@ def load(afilepath):
675675
loader = netCDF4.Dataset
676676
else:
677677
raise ValueError('filepath suffix not supported: {}'.format(afilepath))
678-
if not os.path.exists(afilepath):
679-
raise IOError('{} not found'.format(afilepath))
678+
#Disable this check for now to allow URL input
679+
#TODO: Add feature to check both local files and files on the web, e.g. URLs
680+
#if not os.path.exists(afilepath):
681+
# raise IOError('{} not found'.format(afilepath))
680682
try:
681683
f = loader(afilepath, "r")
682684
yield f

nc2rdf/README.md

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,4 +31,19 @@ $ python nc2rdf.py -o ttl myfile.nc
3131
$ python nc2rdf.py -o xml myfile.nc
3232
```
3333

34+
## nc2schemaorg
35+
36+
This feature provides users a way to create schema.org descriptions from
37+
ACDD/CF/NUG-conformant values in a netCDF (`.nc`) or CDL file.
38+
39+
```
40+
$ python nc2rdf.py -o json-ld --schema-org [cdl or nc file]
41+
```
42+
43+
Example:
44+
```
45+
$ python nc2rdf.py -o json-ld --schema-org ../lib/bald/tests/integration/CDL/trajectoryProfile_template.cdl
46+
```
47+
48+
3449
Note: This command-line tool is experimental and subject to change; it serves as a prototype for using bald functions to convert netCDF-related files to RDF.

nc2rdf/bald2schemaorg_mappings.json

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
[
2+
{ "bald" : "summary", "schemaorg": "description" },
3+
{ "bald" : "title", "schemaorg": "name" },
4+
{ "bald" : "id", "schemaorg": "identifier" },
5+
{ "bald" : "keywords", "schemaorg": "keywords" },
6+
{ "bald" : "license", "schemaorg": "license" },
7+
{ "bald" : "standard_name", "schemaorg": "variableMeasured" }
8+
]

nc2rdf/nc2rdf.py

Lines changed: 108 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,14 +5,111 @@
55
import netCDF4
66
import numpy as np
77
import bald
8+
import rdflib
9+
import json
10+
from rdflib import Namespace, BNode, URIRef, Literal
11+
from rdflib.namespace import RDF
12+
try:
13+
# python 3
14+
from urllib.parse import urlparse
15+
except ImportError:
16+
from urlparse import urlparse
17+
18+
def isUrl(url):
    """
    Return True if `url` is an absolute http(s) URL with a host and a path.

    Anything else -- local file paths, other schemes, a URL without a path
    component, or input that cannot be parsed -- returns False.
    """
    try:
        result = urlparse(url)
    except (ValueError, AttributeError, TypeError):
        # urlparse raises ValueError for malformed network locations and
        # attribute/type errors for input that is not string-like.
        return False
    # Always hand back a real bool rather than falling through to None.
    return (result.scheme in ('http', 'https')
            and bool(result.netloc)
            and bool(result.path))
25+
26+
def getBasename(urlstr):
    """
    Return the final '/'-separated component of `urlstr`.

    The result is an empty string when `urlstr` ends with a separator.
    """
    _head, tail = os.path.split(urlstr)
    return tail
28+
29+
def _load_bald2schemaorg_mappings():
    """Return a dict of bald term -> schema.org term from the mappings file."""
    # Resolve the file next to this module so the tool also works when it is
    # invoked from a directory other than the one containing the script.
    mapping_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                'bald2schemaorg_mappings.json')
    with open(mapping_path, 'r') as f:
        mapping_data = json.load(f)
    return dict((item['bald'], item['schemaorg']) for item in mapping_data)


def baldgraph2schemaorg(graph, path=None, baseuri=None):
    """
    Build a schema.org description from a bald-style rdflib graph.

    Input:
      graph   - rdflib.Graph in bald style; it is queried for the properties
                of every bald:Container it describes
      path    - optional location of the source file; when it is an http(s)
                URL it is recorded as the schema.org url of the dataset
      baseuri - optional URI used to identify the dataset node; a blank node
                is used when it is not given

    Returns a new rdflib.Graph holding one schema.org Dataset.
    """
    # HACK: The following mappings ignore prefixes as well as prefixes in nc file
    # TODO: Fix references to prefixes/aliases proper
    mapping_idx = _load_bald2schemaorg_mappings()

    qres = graph.query(
        """PREFIX bald: <http://binary-array-ld.net/latest/>
        SELECT DISTINCT ?pred ?value
        WHERE {
            ?c a bald:Container .
            ?c ?pred ?value
        }""")

    schema_g = rdflib.Graph()

    if baseuri is not None:
        container = URIRef(baseuri)
    else:
        container = BNode()

    so = Namespace("http://schema.org/")
    schema_g.add((container, RDF.type, so.Dataset))

    if path is not None and isUrl(path):
        schema_g.add((container, URIRef("http://schema.org/url"), URIRef(path)))

    for row in qres:
        # Only the last path segment of the predicate is matched (see HACK).
        currField = getBasename(str(row[0])).strip()
        if currField not in mapping_idx:
            continue

        predUri = URIRef("http://schema.org/" + mapping_idx[currField])
        if currField == 'keywords':
            # A comma-separated keyword string becomes one literal per keyword.
            for x in row[1].split(','):
                kw = x.strip()
                if kw:
                    schema_g.add((container, predUri, Literal(kw)))
            continue

        schema_g.add((container, predUri, Literal(row[1])))
    return schema_g
87+
88+
def nc2schemaorg(ncfilename, outformat, baseuri=None):
    """
    Print a schema.org description of a netCDF file.

    Input:
      ncfilename - netCDF file location, passed to bald.load_netcdf
      outformat  - rdflib serialisation format name; 'json-ld' output is
                   written with the schema.org context and 4-space indent
      baseuri    - optional base URI for the graph

    Prints the serialised graph and returns the schema.org rdflib.Graph.
    """
    root_container = bald.load_netcdf(ncfilename, baseuri=baseuri)
    graph = root_container.rdfgraph()
    schema_g = baldgraph2schemaorg(graph, path=ncfilename, baseuri=baseuri)

    if outformat == 'json-ld':
        context = "http://schema.org/"
        s = schema_g.serialize(format=outformat, context=context, indent=4)
    else:
        s = schema_g.serialize(format=outformat)

    # serialize() returns bytes on older rdflib releases and str on newer
    # ones; only decode when there is something to decode.
    if isinstance(s, bytes):
        s = s.decode("utf-8")
    print(s)
    return schema_g
899

9100
def nc2rdf(ncfilename, outformat, baseuri=None):
    """
    Print the bald RDF graph of a netCDF file.

    Input:
      ncfilename - netCDF file location, passed to bald.load_netcdf
      outformat  - rdflib serialisation format name
      baseuri    - optional base URI for the graph
    """
    root_container = bald.load_netcdf(ncfilename, baseuri=baseuri)
    ttl = root_container.rdfgraph().serialize(format=outformat)
    # serialize() returns bytes on older rdflib releases and str on newer
    # ones; only decode when there is something to decode.
    if isinstance(ttl, bytes):
        ttl = ttl.decode("utf-8")
    print(ttl)
15104

105+
def cdl2schemaorg(cdl_file, outformat, baseuri=None):
    """
    Print a schema.org description of a CDL file.

    The CDL file is first converted to a temporary netCDF file with the
    external `ncgen` command, which is then handed to nc2schemaorg. The
    temporary file is removed afterwards, including when a step fails.

    Input:
      cdl_file  - path of the CDL file
      outformat - rdflib serialisation format name
      baseuri   - optional base URI for the graph

    Returns whatever nc2schemaorg returns.
    Raises subprocess.CalledProcessError when ncgen exits with an error.
    """
    tfile, tfilename = tempfile.mkstemp('.nc')
    # Only the file name is needed; release the descriptor before ncgen
    # writes to the same path.
    os.close(tfile)
    try:
        subprocess.check_call(['ncgen', '-o', tfilename, cdl_file])
        schema_g = nc2schemaorg(tfilename, outformat, baseuri=baseuri)
    finally:
        os.remove(tfilename)
    return schema_g
112+
16113
def cdl2rdf(cdl_file, outformat, baseuri=None):
17114
#print("cdl2rdf test")
18115
#print(cdl_file)
@@ -32,13 +129,20 @@ def cdl2rdf(cdl_file, outformat, baseuri=None):
32129
parser.add_argument('--baseuri', action="store", dest="baseuri", help="Base URI for the graph")
33130
parser.add_argument('--cdl', action="store_true", dest="isCDL", default=False, help="Flag to indicate file is CDL")
34131
parser.add_argument('--nc', action="store_true", dest="isNC", default=False, help="Flag to indicate file is netCDF")
132+
parser.add_argument('--schema-org', action="store_true", dest="isSchemaOrgOutput", default=False, help="Flag to indicate if schema.org output activated")
35133
parser.add_argument("ncfile", help="Path for the netCDF file")
36134

37135
args = parser.parse_args()
38136

39137
if(args.isCDL or args.ncfile.endswith(".cdl") or args.ncfile.endswith('.CDL')):
40-
cdl2rdf(args.ncfile, args.format, baseuri=args.baseuri)
138+
if(args.isSchemaOrgOutput):
139+
cdl2schemaorg(args.ncfile, args.format, baseuri=args.baseuri)
140+
else:
141+
cdl2rdf(args.ncfile, args.format, baseuri=args.baseuri)
41142
elif(args.isNC or args.ncfile.endswith(".nc") or args.ncfile.endswith('.NC')):
42-
nc2rdf(args.ncfile, args.format, baseuri=args.baseuri)
143+
if(args.isSchemaOrgOutput):
144+
nc2schemaorg(args.ncfile, args.format, baseuri=args.baseuri)
145+
else:
146+
nc2rdf(args.ncfile, args.format, baseuri=args.baseuri)
43147
else:
44148
print("Unrecognised file suffix. Please indicate if CDL or NC via --cdl or --nc");

0 commit comments

Comments
 (0)