Skip to content

Commit de0935f

Browse files
committed
alias_graph initial
1 parent 608acc1 commit de0935f

File tree

3 files changed

+343
-232
lines changed

3 files changed

+343
-232
lines changed

lib/bald/__init__.py

Lines changed: 154 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import contextlib
2+
import copy
23
import re
34

45
import h5py
@@ -252,19 +253,23 @@ def check_uri(self, uri):
252253

253254
class Subject(object):
254255
_rdftype = 'bald__Subject'
255-
def __init__(self, identity, attrs=None, prefixes=None, aliases=None):
256+
def __init__(self, baseuri, relative_id, attrs=None, prefixes=None,
257+
aliases=None, alias_graph=None):
256258
"""
257259
A subject of metadata statements.
258260
259261
attrs: a dictionary of key-value pair attributes
260262
"""
261-
self.identity = identity
263+
self.baseuri = baseuri
264+
self.relative_id = relative_id
265+
262266
if attrs is None:
263267
attrs = {}
264268
if prefixes is None:
265269
prefixes = {}
266270
if aliases is None:
267271
aliases = {}
272+
268273
self.attrs = attrs
269274

270275
self.rdf__type = self._rdftype
@@ -275,6 +280,13 @@ def __init__(self, identity, attrs=None, prefixes=None, aliases=None):
275280
_http_p = 'http[s]?://.*'
276281
self._http_uri = re.compile('{}'.format(_http_p))
277282
self._http_uri_prefix = re.compile('{}/|#'.format(_http_p))
283+
if alias_graph is None:
284+
alias_graph = rdflib.Graph()
285+
self.alias_graph = alias_graph
286+
287+
@property
def identity(self):
    """
    The full identifier of this subject.

    Built by joining ``baseuri`` and ``relative_id`` with a single '/'.

    """
    parts = (self.baseuri, self.relative_id)
    return '/'.join(parts)
278290

279291
def __str__(self):
    """
    Return a readable summary: ``<identity>:<type>: <attrs>``.

    """
    fields = (self.identity, type(self), self.attrs)
    return '{}:{}: {}'.format(*fields)
@@ -283,8 +295,9 @@ def __repr__(self):
283295
return str(self)
284296

285297
def __setattr__(self, attr, value):
286-
reserved_attrs = ['identity', 'prefixes', '_prefixes', '_prefix_suffix',
287-
'_http_uri_prefix', '_http_uri', 'aliases', 'attrs', '_rdftype']
298+
reserved_attrs = ['baseuri', 'relative_id', 'prefixes', '_prefixes',
299+
'_prefix_suffix', '_http_uri_prefix', '_http_uri',
300+
'aliases', 'alias_graph', 'attrs', '_rdftype']
288301
if attr in reserved_attrs:
289302
object.__setattr__(self, attr, value)
290303
else:
@@ -307,6 +320,7 @@ def __getattr__(self, attr):
307320
raise AttributeError(msg)
308321
return self.attrs[attr]
309322

323+
#@property
310324
def prefixes(self):
311325
prefixes = {}
312326
for key, value in self._prefixes.items():
@@ -318,23 +332,77 @@ def prefixes(self):
318332
prefixes[pref] = value
319333
return prefixes
320334

321-
def unpack_uri(self, astring):
322-
"""
323-
Return a URI for the given input string, or return the astring unchanged if
324-
none is available.
335+
def unpack_predicate(self, astring):
    """
    Return the predicate URI to use for an attribute name.

    Resolution, in order:

    1. a string matched by ``self._prefix_suffix`` whose prefix is
       declared in :meth:`prefixes` with an http(s) URI has its
       ``<prefix>__`` part replaced by that URI;
    2. any other string is looked up in ``self.alias_graph`` as the
       ``dct:identifier`` of an ``rdf:Property`` or
       ``owl:ObjectProperty``;
    3. a name that neither step changed is qualified as
       ``<baseuri>/<name>``.

    astring: the attribute name to resolve

    Raises ValueError if the alias graph offers more than one match.

    """
    result = astring
    is_string = isinstance(astring, six.string_types)
    # Run the regex once, and only on strings.
    match = self._prefix_suffix.match(astring) if is_string else None
    if match:
        prefix, _suffix = match.groups()
        # prefixes() rebuilds its dict on every call, so call it once.
        prefixes = self.prefixes()
        if prefix in prefixes and self._http_uri.match(prefixes[prefix]):
            result = astring.replace('{}__'.format(prefix),
                                     prefixes[prefix])
    elif is_string:
        # The identifier is supplied through initBindings rather than
        # being formatted into the query text, so names containing
        # quotes, backslashes or newlines cannot break or alter the query.
        # rdf: is declared explicitly so the query does not depend on
        # the graph's own namespace bindings.
        predicate_alias_query = (
            'prefix dct: <http://purl.org/dc/terms/> '
            'prefix owl: <http://www.w3.org/2002/07/owl#> '
            'prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> '
            'select ?uri where '
            '{?uri dct:identifier ?identifier ; '
            '      rdf:type ?type. '
            'FILTER(?type in (rdf:Property, owl:ObjectProperty) ) '
            '}')
        qres = self.alias_graph.query(
            predicate_alias_query,
            initBindings={'identifier': rdflib.Literal(astring)})
        results = list(qres)
        if len(results) > 1:
            raise ValueError('multiple alias options')
        elif len(results) == 1:
            result = str(results[0][0])
    # Nothing resolved the name: treat it as local to this file.
    if result == astring:
        result = self.baseuri + '/' + result
    return result
325361

326-
"""
362+
def unpack_rdfobject(self, astring, predicate):
    """
    Return a URI string for an attribute value where one can be found,
    or return the value unchanged if none is available.

    Resolution, in order:

    1. a string matched by ``self._prefix_suffix`` whose prefix is
       declared in :meth:`prefixes` with an http(s) URI has its
       ``<prefix>__`` part replaced by that URI;
    2. any other string is looked up in ``self.alias_graph`` for a node
       whose ``dct:identifier`` equals the value and whose ``rdf:type``
       is the ``rdfs:range`` of ``predicate``.

    Non-string values are returned untouched.

    astring: the attribute value to resolve
    predicate: the URI of the predicate this value is the object of

    Raises ValueError if the alias graph offers more than one match.

    """
    result = astring
    is_string = isinstance(astring, six.string_types)
    # Run the regex once, and only on strings.
    match = self._prefix_suffix.match(astring) if is_string else None
    if match:
        prefix, _suffix = match.groups()
        # prefixes() rebuilds its dict on every call, so call it once.
        prefixes = self.prefixes()
        if prefix in prefixes and self._http_uri.match(prefixes[prefix]):
            result = astring.replace('{}__'.format(prefix),
                                     prefixes[prefix])
    elif is_string:
        # The predicate is deliberately not required to be an http URI:
        # it can be a file URI too.
        #
        # Both the predicate and the value are supplied through
        # initBindings rather than being formatted into the query text.
        # Attribute values routinely contain quotes, backslashes or
        # newlines, any of which would make an interpolated query
        # unparseable or change its meaning.
        # rdf: and rdfs: are declared explicitly so the query does not
        # depend on the graph's own namespace bindings.
        rdfobj_alias_query = (
            'prefix dct: <http://purl.org/dc/terms/> '
            'prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> '
            'prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> '
            'select ?uri where '
            '{ ?pred rdfs:range ?range . '
            '?uri dct:identifier ?identifier ; '
            '     rdf:type ?range .'
            '}')
        bindings = {
            'pred': rdflib.URIRef(six.text_type(predicate)),
            'identifier': rdflib.Literal(astring),
        }
        qres = self.alias_graph.query(rdfobj_alias_query,
                                      initBindings=bindings)
        results = list(qres)
        if len(results) > 1:
            raise ValueError('multiple alias options')
        elif len(results) == 1:
            result = str(results[0][0])
    return result
337388

389+
# def unpack_uri(self, astring):
390+
# """
391+
# Return a URI for the given input string, or return the astring unchanged if
392+
# none is available.
393+
394+
# """
395+
# result = astring
396+
# if isinstance(astring, six.string_types) and self._prefix_suffix.match(astring):
397+
# prefix, suffix = self._prefix_suffix.match(astring).groups()
398+
# if prefix in self.prefixes():
399+
# if self._http_uri.match(self.prefixes()[prefix]):
400+
# result = astring.replace('{}__'.format(prefix),
401+
# self.prefixes()[prefix])
402+
# elif isinstance(astring, six.string_types) and astring in self.aliases:
403+
# result = self.aliases[astring]
404+
# return result
405+
338406
@property
339407
def link_template(self):
340408
return '<a xlink:href="{url}" xlink:show=new text-decoration="underline">{key}</a>'
@@ -425,13 +493,27 @@ def rdfnode(self, graph):
425493
if not (isinstance(objs, set) or isinstance(objs, list)):
426494
objs = set([objs])
427495
for obj in objs:
496+
rdfpred = self.unpack_predicate(attr)
428497
if isinstance(obj, Subject):
429498
rdfobj = rdflib.URIRef(obj.identity)
430-
elif is_http_uri(self.unpack_uri(obj)):
431-
rdfobj = rdflib.URIRef(self.unpack_uri(obj))
432499
else:
433-
rdfobj = rdflib.Literal(obj)
434-
graph.add((selfnode, rdflib.URIRef(self.unpack_uri(attr)), rdfobj))
500+
rdfobj = self.unpack_rdfobject(obj, rdfpred)
501+
if is_http_uri(rdfobj):
502+
rdfobj = rdflib.URIRef(rdfobj)
503+
else:
504+
rdfobj = rdflib.Literal(rdfobj)
505+
rdfpred = rdflib.URIRef(rdfpred)
506+
try:
507+
graph.add((selfnode, rdfpred, rdfobj))
508+
509+
except AssertionError:
510+
511+
graph.add((selfnode, rdfpred, rdfobj))
512+
# elif is_http_uri(self.unpack_uri(obj)):
513+
# rdfobj = rdflib.URIRef(self.unpack_uri(obj))
514+
# else:
515+
# rdfobj = rdflib.Literal(obj)
516+
# graph.add((selfnode, rdflib.URIRef(self.unpack_uri(attr)), rdfobj))
435517
if isinstance(obj, Subject):
436518
obj_ref = rdflib.URIRef(obj.identity)
437519
if (obj_ref, None, None) not in graph:
@@ -446,11 +528,23 @@ def rdfgraph(self):
446528
"""
447529
graph = rdflib.Graph()
448530
graph.bind('bald', 'http://binary-array-ld.net/latest/')
449-
for prefix_name in self._prefixes:
450-
#strip the double underscore suffix
451-
new_name = prefix_name[:-2]
531+
for prefix_name in self.prefixes():
532+
533+
#strip the double underscore suffix
534+
535+
# new_name = prefix_name[:-2]
452536

453-
graph.bind(new_name, self._prefixes[prefix_name])
537+
graph.bind(prefix_name, self.prefixes()[prefix_name])
538+
539+
for alias_name in self.aliases:
540+
# hack :S
541+
uri = self.aliases[alias_name]
542+
if '?_format' in uri:
543+
uri = uri.split('?')[0]
544+
if not (uri.endswith('#') or uri.endswith('/')):
545+
uri = uri + '/'
546+
graph.bind(alias_name, uri)
547+
454548
graph = self.rdfnode(graph)
455549

456550
return graph
@@ -532,12 +626,13 @@ def load(afilepath):
532626
finally:
533627
f.close()
534628

535-
def load_netcdf(afilepath, baseuri=None):
629+
def load_netcdf(afilepath, baseuri=None, alias_dict=None):
536630
"""
537631
Load a file with respect to binary-array-linked-data.
538632
Returns a :class:`bald.Collection`
539633
"""
540-
634+
if alias_dict == None:
635+
alias_dict = {}
541636
with load(afilepath) as fhandle:
542637
if baseuri is None:
543638
baseuri = 'file://{}'.format(afilepath)
@@ -581,36 +676,39 @@ def load_netcdf(afilepath, baseuri=None):
581676
if isinstance(alias_var, netCDF4._netCDF4.Variable):
582677
skipped_variables.append(alias_var.name)
583678

679+
aliases = careful_update(aliases, alias_dict)
584680
attrs = {}
585681
for k in fhandle.ncattrs():
586682
attrs[k] = getattr(fhandle, k)
587-
# process Conventions
588-
# Conventions = "CF-1.6, ACDD-1.3"
683+
589684
aliasgraph = rdflib.Graph()
590-
if hasattr(fhandle, 'Conventions'):
591-
conventions = [c.strip() for c in fhandle.Conventions.split(',')]
592-
for conv in conventions:
593-
if conv.startswith('CF-'):
594-
uri = 'http://def.scitools.org.uk/CFTerms?_format=ttl'
595-
result = aliasgraph.parse(uri)
596-
qstr = ('select ?alias ?uri where '
597-
'{?uri dct:identifier ?alias .}')
598-
qres = aliasgraph.query(qstr)
599-
600-
new_aliases = [(str(q[0]), str(q[1])) for q in list(qres)]
601-
na_keys = [n[0] for n in new_aliases]
602-
if len(set(na_keys)) != len(na_keys):
603-
raise ValueError('duplicate aliases')
604-
aliases = careful_update(aliases, dict(new_aliases))
605-
685+
for alias in aliases:
686+
aliasgraph.parse(aliases[alias], format='xml')
687+
# if hasattr(fhandle, 'Conventions'):
688+
# conventions = [c.strip() for c in fhandle.Conventions.split(',')]
689+
# for conv in conventions:
690+
# if conv.startswith('CF-'):
691+
# uri = 'http://def.scitools.org.uk/CFTerms?_format=ttl'
692+
# aliasgraph.parse(uri)
693+
# uri = 'http://vocab.nerc.ac.uk/standard_name/'
694+
# aliasgraph.parse(uri, format='xml')
695+
# qstr = ('select ?alias ?uri where '
696+
# '{?uri dct:identifier ?alias .}')
697+
# qres = aliasgraph.query(qstr)
698+
699+
# new_aliases = [(str(q[0]), str(q[1])) for q in list(qres)]
700+
# na_keys = [n[0] for n in new_aliases]
701+
# if len(set(na_keys)) != len(na_keys):
702+
# raise ValueError('duplicate aliases')
703+
# aliases = careful_update(aliases, dict(new_aliases))
606704

607-
root_container = Container(identity, attrs, prefixes=prefixes,
608-
aliases=aliases)
705+
root_container = Container(baseuri, '', attrs, prefixes=prefixes,
706+
aliases=aliases, alias_graph=aliasgraph)
609707

610708
root_container.attrs['bald__contains'] = []
611709
file_variables = {}
612710
for name in fhandle.variables:
613-
if name == prefix_var_name or name == alias_var_name:
711+
if name == prefix_var_name:
614712
continue
615713

616714
sattrs = fhandle.variables[name].__dict__.copy()
@@ -627,21 +725,25 @@ def load_netcdf(afilepath, baseuri=None):
627725

628726
if fhandle.variables[name].shape:
629727
sattrs['bald__shape'] = fhandle.variables[name].shape
630-
var = Array(identity, sattrs, prefixes=prefixes, aliases=aliases)
728+
var = Array(baseuri, name, sattrs, prefixes=prefixes,
729+
aliases=aliases, alias_graph=aliasgraph)
631730
else:
632-
var = Subject(identity, sattrs, prefixes=prefixes, aliases=aliases)
731+
var = Subject(baseuri, name, sattrs, prefixes=prefixes,
732+
aliases=aliases, alias_graph=aliasgraph)
633733
root_container.attrs['bald__contains'].append(var)
634734
file_variables[name] = var
635735

636736

637737
reference_prefixes = dict()
638-
reference_graph = aliasgraph
738+
reference_graph = copy.copy(aliasgraph)
639739
reference_graph.parse('http://binary-array-ld.net/latest?_format=ttl')
640740
qstr = ('prefix bald: <http://binary-array-ld.net/latest/> '
741+
'prefix skos: <http://www.w3.org/2004/02/skos/core#> '
641742
'select ?s '
642743
'where { '
643744
' ?s rdfs:range ?type . '
644-
'filter(?type != rdfs:Literal)'
745+
'filter(?type != rdfs:Literal) '
746+
'filter(?type != skos:Concept) '
645747
'}')
646748
refs = reference_graph.query(qstr)
647749

@@ -650,7 +752,7 @@ def load_netcdf(afilepath, baseuri=None):
650752

651753
# cycle again and find references
652754
for name in fhandle.variables:
653-
if name == prefix_var_name or name == alias_var_name:
755+
if name == prefix_var_name:
654756
continue
655757

656758
var = file_variables[name]
@@ -662,7 +764,7 @@ def load_netcdf(afilepath, baseuri=None):
662764

663765
# for sattr in sattrs:
664766
for sattr in (sattr for sattr in sattrs if
665-
root_container.unpack_uri(sattr) in ref_prefs):
767+
root_container.unpack_predicate(sattr) in ref_prefs):
666768
# if sattr == 'coordinates':
667769
# import pdb; pdb.set_trace()
668770

@@ -712,7 +814,10 @@ def load_netcdf(afilepath, baseuri=None):
712814
reshape[cvi] = fhandle.variables[dim].size
713815
rattrs['bald__childBroadcast'] = tuple(reshape)
714816
rattrs['bald__array'] = set((file_variables.get(dim),))
715-
ref_node = Subject(identity, rattrs, prefixes=prefixes, aliases=aliases)
817+
ref_node = Subject(baseuri, name, rattrs,
818+
prefixes=prefixes,
819+
aliases=aliases,
820+
alias_graph=aliasgraph)
716821
root_container.attrs['bald__contains'].append(ref_node)
717822
file_variables[name] = ref_node
718823
refset.add(ref_node)

0 commit comments

Comments
 (0)