Skip to content

Commit 3573cc7

Browse files
authored
Added an ability to check table's integrity (#257)
* Added an ability to checks table's integrity * Fixed error message * Fixed linting
1 parent 2ddf01e commit 3573cc7

File tree

5 files changed

+83
-15
lines changed

5 files changed

+83
-15
lines changed

README.md

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -214,13 +214,14 @@ Constructor to instantiate `Table` class. If `references` argument is provided,
214214

215215
- `(str/None)` - returns the table's SHA256 hash if it's already read using e.g. `table.read`, otherwise returns `None`. In the middle of an iteration it returns hash of already read contents
216216

217-
#### `table.iter(keyed=Fase, extended=False, cast=True, relations=False, foreign_keys_values=False)`
217+
#### `table.iter(keyed=False, extended=False, cast=True, integrity=False, relations=False, foreign_keys_values=False)`
218218

219219
Iterates through the table data and emits rows cast based on table schema. Data casting can be disabled.
220220

221221
- `keyed (bool)` - iterate keyed rows
222222
- `extended (bool)` - iterate extended rows
223223
- `cast (bool)` - disable data casting if false
224+
- `integrity (dict)` - dictionary in the form `{'size': <bytes>, 'hash': '<sha256>'}` used to check the integrity of the table once it has been read completely. Both keys are optional.
224225
- `relations (dict)` - dictionary of foreign key references in a form of `{resource1: [{field1: value1, field2: value2}, ...], ...}`. If provided, foreign key fields will be checked and resolved to one of their references (/!\ one-to-many fk are not completely resolved).
225226
- `foreign_keys_values (dict)` - three-level dictionary of foreign key references optimized to speed up validation process in a form of `{resource1: { (foreign_key_field1, foreign_key_field2) : { (value1, value2) : {one_keyedrow}, ... }}}`. If not provided but relations is true, it will be created before the validation process by *index_foreign_keys_values* method
226227
- `(exceptions.TableSchemaException)` - raises any error that occurs during this process
@@ -229,13 +230,14 @@ Iterates through the table data and emits rows cast based on table schema. Data
229230
- `{header1: value1, header2: value2}` - keyed
230231
- `[rowNumber, [header1, header2], [value1, value2]]` - extended
231232

232-
#### `table.read(keyed=False, extended=False, cast=True, relations=False, limit=None, foreign_keys_values=False)`
233+
#### `table.read(keyed=False, extended=False, cast=True, integrity=False, relations=False, limit=None, foreign_keys_values=False)`
233234

234235
Reads the whole table and returns it as an array of rows. The number of rows can be limited.
235236

236237
- `keyed (bool)` - flag to emit keyed rows
237238
- `extended (bool)` - flag to emit extended rows
238239
- `cast (bool)` - flag to disable data casting if false
240+
- `integrity (dict)` - dictionary in the form `{'size': <bytes>, 'hash': '<sha256>'}` used to check the integrity of the table once it has been read completely. Both keys are optional.
239241
- `relations (dict)` - dict of foreign key references in a form of `{resource1: [{field1: value1, field2: value2}, ...], ...}`. If provided, foreign key fields will be checked and resolved to their references
240242
- `limit (int)` - integer limit of rows to return
241243
- `foreign_keys_values (dict)` - three-level dictionary of foreign key references optimized to speed up validation process in a form of `{resource1: { (foreign_key_field1, foreign_key_field2) : { (value1, value2) : {one_keyedrow}, ... }}}`
@@ -646,10 +648,14 @@ All validation errors.
646648

647649
All value cast errors.
648650

649-
#### `exceptions.RelationError`
651+
#### `exceptions.IntegrityError`
650652

651653
All integrity errors.
652654

655+
#### `exceptions.RelationError`
656+
657+
All relations errors.
658+
653659
#### `exceptions.StorageError`
654660

655661
All storage errors.

pylama.ini

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ ignore = E128,E301,E305,E731
66
max_line_length = 100
77

88
[pylama:mccabe]
9-
complexity = 24
9+
complexity = 36
1010

1111
[pylama:*/__init__.py]
1212
ignore = W0611

tableschema/exceptions.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,10 @@ class CastError(TableSchemaException):
4040
pass
4141

4242

43+
class IntegrityError(TableSchemaException):
44+
pass
45+
46+
4347
class RelationError(TableSchemaException):
4448
pass
4549

tableschema/table.py

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,8 @@ def hash(self):
8080
if self.__stream:
8181
return self.__stream.hash
8282

83-
def iter(self, keyed=False, extended=False, cast=True, relations=False,
83+
def iter(self, keyed=False, extended=False, cast=True,
84+
integrity=False, relations=False,
8485
foreign_keys_values=False):
8586
"""https://github.com/frictionlessdata/tableschema-py#table
8687
"""
@@ -169,16 +170,31 @@ def iter(self, keyed=False, extended=False, cast=True, relations=False,
169170
else:
170171
yield row
171172

173+
# Check integrity
174+
if integrity:
175+
violations = []
176+
size = integrity.get('size')
177+
hash = integrity.get('hash')
178+
if size and size != self.__stream.size:
179+
violations.append('size "%s"' % self.__stream.size)
180+
if hash and hash != self.__stream.hash:
181+
violations.append('hash "%s"' % self.__stream.hash)
182+
if violations:
183+
message = 'Calculated %s differ(s) from declared value(s)'
184+
raise exceptions.IntegrityError(message % ' and '.join(violations))
185+
172186
# Close stream
173187
self.__stream.close()
174188

175-
def read(self, keyed=False, extended=False, cast=True, relations=False, limit=None,
189+
def read(self, keyed=False, extended=False, cast=True, limit=None,
190+
integrity=False, relations=False,
176191
foreign_keys_values=False):
177192
"""https://github.com/frictionlessdata/tableschema-py#table
178193
"""
179194
result = []
180-
rows = self.iter(keyed=keyed, extended=extended, cast=cast, relations=relations,
181-
foreign_keys_values=foreign_keys_values)
195+
rows = self.iter(keyed=keyed, extended=extended, cast=cast,
196+
integrity=integrity, relations=relations,
197+
foreign_keys_values=foreign_keys_values)
182198
for count, row in enumerate(rows, start=1):
183199
result.append(row)
184200
if count == limit:

tests/test_table.py

Lines changed: 49 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -194,25 +194,28 @@ def test_read_with_headers_field_names_mismatch():
194194
assert 'match schema field names' in str(excinfo.value)
195195

196196

197-
# Stats
197+
# Stats/integrity
198+
199+
SIZE = 63
200+
HASH = '328adab247692a1a405e83c2625d52e366389eabf8a1824931187877e8644774'
198201

199202
def test_size():
200203
table = Table('data/data.csv')
201204
table.read()
202-
assert table.size == 63
205+
assert table.size == SIZE
203206

204207

205208
@pytest.mark.skipif(six.PY2, reason='Support only for Python3')
206209
def test_size_compressed():
207210
table = Table('data/data.csv.zip')
208211
table.read()
209-
assert table.size == 63
212+
assert table.size == SIZE
210213

211214

212215
def test_size_remote():
213216
table = Table(BASE_URL % 'data/data.csv')
214217
table.read()
215-
assert table.size == 63
218+
assert table.size == SIZE
216219

217220

218221
def test_size_not_read():
@@ -223,27 +226,66 @@ def test_size_not_read():
223226
def test_hash():
224227
table = Table('data/data.csv')
225228
table.read()
226-
assert table.hash == '328adab247692a1a405e83c2625d52e366389eabf8a1824931187877e8644774'
229+
assert table.hash == HASH
227230

228231

229232
@pytest.mark.skipif(six.PY2, reason='Support only for Python3')
230233
def test_hash_compressed():
231234
table = Table('data/data.csv.zip')
232235
table.read()
233-
assert table.hash == '328adab247692a1a405e83c2625d52e366389eabf8a1824931187877e8644774'
236+
assert table.hash == HASH
234237

235238

236239
def test_hash_remote():
237240
table = Table(BASE_URL % 'data/data.csv')
238241
table.read()
239-
assert table.hash == '328adab247692a1a405e83c2625d52e366389eabf8a1824931187877e8644774'
242+
assert table.hash == HASH
240243

241244

242245
def test_hash():
243246
table = Table(BASE_URL % 'data/data.csv')
244247
assert table.hash is None
245248

246249

250+
def test_read_integrity():
251+
table = Table('data/data.csv')
252+
table.read(integrity={'size': SIZE, 'hash': HASH})
253+
assert True
254+
255+
def test_read_integrity_error():
256+
table = Table('data/data.csv')
257+
with pytest.raises(exceptions.IntegrityError) as excinfo:
258+
table.read(integrity={'size': SIZE + 1, 'hash': HASH + 'a'})
259+
assert str(SIZE) in str(excinfo.value)
260+
assert HASH in str(excinfo.value)
261+
262+
263+
def test_read_integrity_size():
264+
table = Table('data/data.csv')
265+
table.read(integrity={'size': SIZE})
266+
assert True
267+
268+
269+
def test_read_integrity_size_error():
270+
table = Table('data/data.csv')
271+
with pytest.raises(exceptions.IntegrityError) as excinfo:
272+
table.read(integrity={'size': SIZE + 1})
273+
assert str(SIZE) in str(excinfo.value)
274+
275+
276+
def test_read_integrity_hash():
277+
table = Table('data/data.csv')
278+
table.read(integrity={'hash': HASH})
279+
assert True
280+
281+
282+
def test_read_integrity_hash_error():
283+
table = Table('data/data.csv')
284+
with pytest.raises(exceptions.IntegrityError) as excinfo:
285+
table.read(integrity={'hash': HASH + 'a'})
286+
assert HASH in str(excinfo.value)
287+
288+
247289
# Foreign keys
248290

249291
FK_SOURCE = [

0 commit comments

Comments
 (0)