Commit 70f82d3

Merge pull request #9 from marqh/prefcont

Prefcont

2 parents: dcfa808 + ef83273

File tree

11 files changed: +311, -21 lines


.travis.yml

Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
+language: python
+python:
+    - "2.7"
+install:
+    - wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
+    - bash Miniconda3-latest-Linux-x86_64.sh -b -p $HOME/miniconda
+    - export PATH="$HOME/miniconda/bin:$PATH"
+
+    - conda config --add channels conda-forge
+    - conda config --add channels bioconda
+    - conda config --set always_yes yes --set changeps1 no
+    - conda config --set show_channel_urls True
+
+    - ENV_NAME='testing'
+    - conda create --quiet -n $ENV_NAME python=$TRAVIS_PYTHON_VERSION
+    - source activate $ENV_NAME
+    - conda install --quiet --file conda-requirements.txt
+    - conda list
+    - conda info -a
+    - python setup.py --quiet install
+
+script:
+    python -m unittest discover -s bald.tests -v

README.md

Lines changed: 4 additions & 2 deletions
@@ -1,3 +1,5 @@
-# bald
+# Binary Array Linked Data: bald
+
 [![Build Status](https://api.travis-ci.org/repositories/binary-array-ld/bald.svg?branch=master)](http://travis-ci.org/binary-array-ld/bald/branches)
-Python library for validating and managing binary array linked data files.
+
+A Python library for validating and managing binary array linked data files.

conda-requirements.txt

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
+numpy
+h5py
+netCDF4
+requests
+rdflib

lib/bald/__init__.py

Lines changed: 36 additions & 8 deletions
@@ -7,6 +7,7 @@
 
 import bald.validation as bv
 
+__version__ = '0.2'
 
 class HttpCache(object):
     """
@@ -15,9 +16,12 @@ class HttpCache(object):
     def __init__(self):
         self.cache = {}
 
+    def is_http_uri(self, item):
+        return item.startswith('http://') or item.startswith('https://')
+
     def __getitem__(self, item):
 
-        if not item.startswith('http://') or item.startswith('https://'):
+        if not self.is_http_uri(item):
             raise ValueError('{} is not a HTTP URI.'.format(item))
         if item not in self.cache:
             headers = {'Accept': 'text/turtle'}
@@ -33,23 +37,29 @@ def check_uri(self, uri):
 
 
 class Subject(object):
-    def __init__(self, attrs=None):
+    def __init__(self, attrs=None, prefixes=None, aliases=None):
         """
         A subject of metadata statements.
 
         attrs: an dictionary of key value pair attributes
         """
         if attrs is None:
-            attrs = []
+            attrs = {}
+        if prefixes is None:
+            prefixes = {}
+        if aliases is None:
+            aliases = {}
         self.attrs = attrs
+        self.aliases = aliases
+        self._prefixes = prefixes
         self._prefix_suffix = re.compile('(^(?:(?!__).)*)__((?!.*__).*$)')
         _http_p = 'http[s]?://.*'
         self._http_uri = re.compile('{}'.format(_http_p))
         self._http_uri_prefix = re.compile('{}/|#'.format(_http_p))
 
     def prefixes(self):
         prefixes = {}
-        for key, value in self.attrs.iteritems():
+        for key, value in self._prefixes.iteritems():
             if key.endswith('__') and self._http_uri_prefix.match(value):
                 pref = key.rstrip('__')
                 if pref in prefixes:
@@ -66,6 +76,8 @@ def unpack_uri(self, astring):
             if self._http_uri.match(self.prefixes()[prefix]):
                 result = astring.replace('{}__'.format(prefix),
                                          self.prefixes()[prefix])
+        elif astring in self.aliases:
+            result = self.aliases[astring]
         return result
 
 
@@ -105,17 +117,25 @@ def validate_netcdf(afilepath):
 
     with load(afilepath) as fhandle:
         sval = bv.StoredValidation()
+        prefix_group = fhandle[fhandle.bald__isPrefixedBy] if hasattr(fhandle, 'bald__isPrefixedBy') else {}
+        prefixes = {}
+        if prefix_group:
+            prefixes = dict([(prefix, getattr(prefix_group, prefix)) for prefix in prefix_group.ncattrs()])
+        else:
+            for k in fhandle.ncattrs():
+                if k.endswith('__'):
+                    prefixes[k] = getattr(fhandle, k)
         attrs = {}
         for k in fhandle.ncattrs():
             attrs[k] = getattr(fhandle, k)
-        root_container = Subject(attrs)
+        root_container = Subject(attrs, prefixes=prefixes)
         root_val = bv.ContainerValidation(subject=root_container,
                                           fhandle=fhandle)
        sval.stored_exceptions += root_val.exceptions()
         for name in fhandle.variables:
             sattrs = fhandle.__dict__.copy()
             sattrs.update(fhandle.variables[name].__dict__.copy())
-            var = Subject(sattrs)
+            var = Subject(sattrs, prefixes=prefixes)
             var_val = bv.ArrayValidation(name, fhandle.variables[name], fhandle=fhandle,
                                          subject=var)
             sval.stored_exceptions += var_val.exceptions()
@@ -132,7 +152,15 @@ def validate_hdf5(afilepath):
     with load(afilepath) as fhandle:
         sval = bv.StoredValidation()
         cache = {}
-        root_container = Subject(fhandle.attrs)
+        prefix_group = fhandle.attrs.get('bald__isPrefixedBy')
+        prefixes = {}
+        if prefix_group:
+            prefixes = fhandle[prefix_group].attrs
+        alias_group = fhandle.attrs.get('bald__isAliasedBy')
+        aliases = {}
+        if alias_group:
+            aliases = dict(fhandle[alias_group].attrs.iteritems())
+        root_container = Subject(fhandle.attrs, prefixes=prefixes, aliases=aliases)
         root_val = bv.ContainerValidation(subject=root_container,
                                           fhandle=fhandle)
         sval.stored_exceptions += root_val.exceptions()
@@ -144,7 +172,7 @@ def validate_hdf5(afilepath):
             # #
             sattrs = dict(fhandle.attrs).copy()
             sattrs.update(dataset.attrs)
-            dset = Subject(sattrs)
+            dset = Subject(sattrs, prefixes, aliases)
             dset_val = bv.ArrayValidation(name, dataset, fhandle=fhandle,
                                           subject=dset)
             sval.stored_exceptions += dset_val.exceptions()
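
The heart of this change is that `Subject` now takes explicit `prefixes` and `aliases` mappings: `prefixes()` reads keys ending in `__` from the prefix map, and `unpack_uri()` falls back to the alias table when a name is not a prefixed term. A minimal standalone sketch of that resolution idea, using illustrative mappings rather than the library internals:

```python
# Illustrative sketch of the prefix/alias resolution added above; not the
# library code itself. `prefixes` maps 'name__' keys to namespace URIs,
# `aliases` maps whole attribute names to full URIs.

def unpack(astring, prefixes, aliases):
    for key, namespace in prefixes.items():   # keys end with '__', e.g. 'skos__'
        if astring.startswith(key):
            return astring.replace(key, namespace)
    if astring in aliases:
        return aliases[astring]
    return astring

prefixes = {'skos__': 'http://www.w3.org/2004/02/skos/core#'}
aliases = {'prefLabel': 'http://www.w3.org/2004/02/skos/core#prefLabel'}

print(unpack('skos__prefLabel', prefixes, aliases))  # expanded via the prefix map
print(unpack('prefLabel', prefixes, aliases))        # resolved via the alias table
```

In `validate_netcdf` and `validate_hdf5` above, those mappings are populated from the `bald__isPrefixedBy` and `bald__isAliasedBy` groups rather than hard-coded literals.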
Lines changed: 59 additions & 0 deletions
@@ -0,0 +1,59 @@
+import unittest
+
+import h5py
+import numpy as np
+
+import bald
+from bald.tests import BaldTestCase
+
+def _fattrs(f):
+    f.attrs['rdf__type'] = 'bald__Container'
+    group_pref = f.create_group('bald__prefix_list')
+    group_pref.attrs['bald__'] = 'http://binary-array-ld.net/latest/'
+    group_pref.attrs['rdf__'] = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
+    f.attrs['bald__isPrefixedBy'] = group_pref.ref
+    return f
+
+def _create_parent_child(f, pshape, cshape):
+    dsetp = f.create_dataset("parent_dataset", pshape, dtype='i')
+    dsetc = f.create_dataset("child_dataset", cshape, dtype='i')
+    dsetp.attrs['rdf__type'] = 'bald__Array'
+    dsetp.attrs['bald__references'] = dsetc.ref
+    dsetc.attrs['rdf__type'] = 'bald__Array'
+    dsetc.attrs['rdf__type'] = 'bald__Reference'
+    dsetc.attrs['bald__array'] = dsetc.ref
+    return f
+
+
+class Test(BaldTestCase):
+
+    def test_valid_uri(self):
+        with self.temp_filename('.hdf') as tfile:
+            f = h5py.File(tfile, "w")
+            f = _fattrs(f)
+            group_alias = f.create_group('bald__alias_list')
+            f.attrs['bald__isAliasedBy'] = group_alias.ref
+            group_alias.attrs['skosPrefLabel'] = 'http://www.w3.org/2004/02/skos/core#prefLabel'
+            dsetp = f.create_dataset("parent_dataset", (11, 17), dtype='i')
+            dsetp.attrs['skosPrefLabel'] = 'alabel'
+            f.close()
+            validation = bald.validate_hdf5(tfile)
+            self.assertTrue(validation.is_valid())
+
+    def test_invalid_uri(self):
+        with self.temp_filename('.hdf') as tfile:
+            f = h5py.File(tfile, "w")
+            f = _fattrs(f)
+            f.attrs['bald__turtle'] = 'bald__walnut'
+            group_alias = f.create_group('bald__alias_list')
+            f.attrs['bald__isAliasedBy'] = group_alias.ref
+            group_alias.attrs['skosPrefLabel'] = 'http://www.w3.org/2004/02/skos/core#notThisPrefLabel'
+            dsetp = f.create_dataset("parent_dataset", (11, 17), dtype='i')
+            dsetp.attrs['skosPrefLabel'] = 'alabel'
+            f.close()
+            validation = bald.validate_hdf5(tfile)
+            self.assertFalse(validation.is_valid())
+
+
+if __name__ == '__main__':
+    unittest.main()
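
This new HDF5 test attaches the prefix and alias groups through HDF5 object references, which `validate_hdf5` dereferences by indexing the file handle (`fhandle[prefix_group].attrs`). A small sketch of that round trip with h5py; the file name and values here are illustrative:

```python
# Sketch: store a prefix group as an object reference, then read it back the
# way validate_hdf5 does. File name and URIs are illustrative.
import h5py

with h5py.File('example.hdf', 'w') as f:
    group_pref = f.create_group('bald__prefix_list')
    group_pref.attrs['bald__'] = 'http://binary-array-ld.net/latest/'
    group_pref.attrs['rdf__'] = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
    f.attrs['bald__isPrefixedBy'] = group_pref.ref  # an object reference, not a string

with h5py.File('example.hdf', 'r') as f:
    ref = f.attrs.get('bald__isPrefixedBy')
    prefixes = dict(f[ref].attrs) if ref is not None else {}
    print(prefixes)
```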

lib/bald/tests/integration/test_netcdf.py

Lines changed: 4 additions & 2 deletions
@@ -8,9 +8,11 @@
 from bald.tests import BaldTestCase
 
 def _fattrs(f):
-    f.bald__ = 'http://binary-array-ld.net/latest/'
-    f.rdf__ = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
     f.rdf__type = 'bald__Container'
+    group_pref = f.createGroup('bald__prefix_list')
+    group_pref.bald__ = 'http://binary-array-ld.net/latest/'
+    group_pref.rdf__ = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
+    f.bald__isPrefixedBy = 'bald__prefix_list'
     return f
 
 def _create_parent_child(f, pshape, cshape):
Lines changed: 78 additions & 0 deletions
@@ -0,0 +1,78 @@
+import unittest
+
+import h5py
+import netCDF4
+import numpy as np
+
+import bald
+from bald.tests import BaldTestCase
+
+def _fattrs(f):
+    f.bald__ = 'http://binary-array-ld.net/latest/'
+    f.rdf__ = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
+    f.rdf__type = 'bald__Container'
+    return f
+
+def _create_parent_child(f, pshape, cshape):
+    for i, pdimsize in enumerate(pshape):
+        f.createDimension("pdim{}".format(str(i)), pdimsize)
+    for i, cdimsize in enumerate(cshape):
+        f.createDimension("cdim{}".format(str(i)), cdimsize)
+    varp = f.createVariable("parent_variable", 'i4', tuple(["pdim{}".format(str(i)) for i, _ in enumerate(pshape)]))
+    varc = f.createVariable("child_variable", 'i4', tuple(["cdim{}".format(str(i)) for i, _ in enumerate(cshape)]))
+    varp.rdf__type = 'bald__Array'
+    varp.bald__references = "child_variable"
+    varc.rdf__type = 'bald__Array'
+    varc.rdf__type = 'bald__Reference'
+    varc.bald__array = "child_variable"
+    return f
+
+
+class Test(BaldTestCase):
+
+    def test_valid_uri(self):
+        with self.temp_filename('.nc') as tfile:
+            f = netCDF4.Dataset(tfile, "w", format="NETCDF4_CLASSIC")
+
+            f = _fattrs(f)
+            f.close()
+            validation = bald.validate_netcdf(tfile)
+            self.assertTrue(validation.is_valid())
+
+    def test_invalid_uri(self):
+        with self.temp_filename('.nc') as tfile:
+            f = netCDF4.Dataset(tfile, "w", format="NETCDF4_CLASSIC")
+
+            f = _fattrs(f)
+            setattr(f, 'bald__turtle', 'bald__walnut')
+            f.close()
+            validation = bald.validate_netcdf(tfile)
+            self.assertFalse(validation.is_valid())
+
+
+class TestArrayReference(BaldTestCase):
+    def test_match(self):
+        with self.temp_filename('.nc') as tfile:
+            f = netCDF4.Dataset(tfile, "w", format="NETCDF4_CLASSIC")
+            f = _fattrs(f)
+            f = _create_parent_child(f, (11, 17), (11, 17))
+            f.close()
+            validation = bald.validate_netcdf(tfile)
+            self.assertTrue(validation.is_valid())
+
+    def test_mismatch_zeroth(self):
+        with self.temp_filename('.nc') as tfile:
+            f = netCDF4.Dataset(tfile, "w", format="NETCDF4_CLASSIC")
+            f = _fattrs(f)
+            f = _create_parent_child(f, (11, 17), (11, 13))
+            f.close()
+            validation = bald.validate_netcdf(tfile)
+            self.assertFalse(validation.is_valid())
+
+
+if __name__ == '__main__':
+    unittest.main()
+
+
+
+
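
The `TestArrayReference` cases above accept a matching parent/child shape pair and reject a mismatched one, so the `bald__references` check appears to come down to shape consistency between the referencing and referenced variables. A rough illustration of that idea (an assumption, not bald's implementation):

```python
# Assumption: bald__references validity hinges on the two variables having
# consistent shapes, mirroring test_match vs. test_mismatch_zeroth above.

def shapes_consistent(pshape, cshape):
    # Same rank and same size along each dimension.
    return len(pshape) == len(cshape) and all(p == c for p, c in zip(pshape, cshape))

print(shapes_consistent((11, 17), (11, 17)))  # True  -> test_match expects valid
print(shapes_consistent((11, 17), (11, 13)))  # False -> test_mismatch_zeroth expects invalid
```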

lib/bald/tests/integration/test_validation.py

Lines changed: 4 additions & 2 deletions
@@ -7,9 +7,11 @@
 from bald.tests import BaldTestCase
 
 def _fattrs(f):
-    f.attrs['bald__'] = 'http://binary-array-ld.net/latest/'
-    f.attrs['rdf__'] = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
     f.attrs['rdf__type'] = 'bald__Container'
+    group_pref = f.create_group('bald__prefix_list')
+    group_pref.attrs['bald__'] = 'http://binary-array-ld.net/latest/'
+    group_pref.attrs['rdf__'] = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
+    f.attrs['bald__isPrefixedBy'] = group_pref.ref
     return f
 
 def _create_parent_child(f, pshape, cshape):

lib/bald/tests/unit/test_validation.py renamed to lib/bald/tests/unit/test_HttpCache.py

Lines changed: 7 additions & 4 deletions
@@ -4,16 +4,19 @@
 import numpy as np
 
 from bald.tests import BaldTestCase
-from bald import validation
+import bald
+
+class TestHttpCache(unittest.TestCase):
+    def setUp(self):
+        self.cache = bald.HttpCache()
 
-class Test(unittest.TestCase):
     def test_check_uri_200(self):
         auri = 'http://binary-array-ld.net/experimental'
-        self.assertTrue(validation.check_uri(auri))
+        self.assertTrue(self.cache.check_uri(auri))
 
     def test_check_uri_404(self):
         notauri = 'http://binary-array-ld.net/experimentalish'
-        self.assertFalse(validation.check_uri(notauri))
+        self.assertFalse(self.cache.check_uri(notauri))
 
 
 if __name__ == '__main__':
if __name__ == '__main__':

lib/bald/validation.py

Lines changed: 11 additions & 3 deletions
@@ -91,12 +91,16 @@ def _check_uri(uri, exceptions):
         for pref, uri in self.subject.prefixes().iteritems():
             exceptions = _check_uri(self.subject.unpack_uri(uri),
                                     exceptions)
+        for alias, uri in self.subject.aliases.iteritems():
+            exceptions = _check_uri(self.subject.unpack_uri(uri),
+                                    exceptions)
         for attr, value in self.subject.attrs.iteritems():
             exceptions = _check_uri(self.subject.unpack_uri(attr),
                                     exceptions)
             if isinstance(value, str):
-                exceptions = _check_uri(self.subject.unpack_uri(value),
-                                        exceptions)
+                val = self.subject.unpack_uri(value)
+                if self.cache.is_http_uri(val):
+                    exceptions = _check_uri(val, exceptions)
         return exceptions
 
     def check_attr_domain_range(self, exceptions):
@@ -106,7 +110,11 @@ def check_attr_domain_range(self, exceptions):
             # thus we have a payload
             # go rdf
             g = rdflib.Graph()
-            g.parse(data=self.cache[uri].text, format="n3")
+            data=self.cache[uri].text
+            try:
+                g.parse(data=self.cache[uri].text, format="n3")
+            except Exception:
+                g.parse(data=self.cache[uri].text, format="xml")
             query = ('SELECT ?s \n'
                      '(GROUP_CONCAT(?domain; SEPARATOR=" | ") AS ?domains)'
                      ' \n'
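
In `check_attr_domain_range`, the fetched vocabulary is no longer assumed to be Turtle/N3: parsing now falls back to RDF/XML when the N3 parse fails. A hedged sketch of the same pattern in isolation, with an illustrative payload:

```python
# Sketch of the parse fallback introduced above: try N3/Turtle first, then RDF/XML.
import rdflib

def parse_payload(text):
    g = rdflib.Graph()
    try:
        g.parse(data=text, format='n3')
    except Exception:
        g.parse(data=text, format='xml')
    return g

turtle = '@prefix ex: <http://example.org/> . ex:a ex:b ex:c .'
print(len(parse_payload(turtle)))  # 1 triple, parsed on the n3 branch
```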
