
Commit 88c0c57

Add check-equality task (#231)
* implement check equality task to compare two datasets
* update readme
* add test for check equality task
* format
* format
* clean up
* remove unused var
* remove unused verify flag
* update test script name in github action
* Merge branch 'master' into check-equality
1 parent 6870a17 commit 88c0c57

8 files changed: +196 -16 lines changed

.github/workflows/main.yml

Lines changed: 2 additions & 2 deletions
@@ -79,8 +79,8 @@ jobs:
       - name: Test anisotropic downsampling
         run: tests/scripts/anisotropic_downsampling.sh
 
-      - name: Test compression
-        run: tests/scripts/compression.sh
+      - name: Test compression and verification
+        run: tests/scripts/compression_and_verification.sh
 
       - name: Test in-place compression
         run: tests/scripts/in_place_compression.sh

README.md

Lines changed: 4 additions & 0 deletions
@@ -20,6 +20,7 @@ Created with [Python3](https://www.python.org/).
 * `wkcuber.compress`: Compress WKW cubes for efficient file storage (especially useful for segmentation data)
 * `wkcuber.metadata`: Create (or refresh) metadata (with guessing of most parameters)
 * `wkcuber.recubing`: Read existing WKW cubes in and write them again specifying the WKW file length. Useful when dataset was written e.g. with file length 1.
+* `wkcuber.check_equality`: Compare two WKW datasets to check whether they are equal (e.g., after compressing a dataset, this task can be useful to double-check that the compressed dataset contains the same data).
 * Most modules support multiprocessing
 
 ## Supported input formats
@@ -89,6 +90,9 @@ python -m wkcuber.metadata --refresh data/target
 
 # Recubing an existing dataset
 python -m wkcuber.recubing --layer_name color --dtype uint8 /data/source/wkw /data/target
+
+# Check two datasets for equality
+python -m wkcuber.check_equality /data/source /data/target
 ```
 
 ### Parallelization
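
For reference, the new task can also be driven from Python instead of the shell. A minimal sketch, assuming only the functions added in wkcuber/check_equality.py below (paths and layer name are placeholders):

    from wkcuber.check_equality import check_equality, create_parser

    # Build the same namespace the CLI would produce; --layer_name / -l is the
    # optional filter this commit adds (compare a single layer only).
    args = create_parser().parse_args(
        ["/data/source", "/data/target", "--layer_name", "color"]
    )
    check_equality(args.source_path, args.target_path, args)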

test.sh

Lines changed: 1 addition & 1 deletion
@@ -12,7 +12,7 @@ tests/scripts/simple_tiff_cubing_no_compression.sh
 tests/scripts/knossos_conversion.sh
 tests/scripts/decompress_reference_mag.sh
 tests/scripts/downsampling.sh
-tests/scripts/compression.sh
+tests/scripts/compression_and_verification.sh
 tests/scripts/in_place_compression.sh
 tests/scripts/meta_generation.sh
 tests/scripts/simple_anisotropic_tiff_cubing.sh

tests/scripts/compression.sh

Lines changed: 0 additions & 9 deletions
This file was deleted.
tests/scripts/compression_and_verification.sh

Lines changed: 30 additions & 0 deletions

@@ -0,0 +1,30 @@
+set -xe
+python -m wkcuber.compress \
+  --jobs 2 \
+  --layer_name color \
+  testoutput/tiff testoutput/tiff_compress
+[ -d testoutput/tiff_compress/color/1 ]
+[ -d testoutput/tiff_compress/color/2 ]
+[ -d testoutput/tiff_compress/color/4 ]
+[ -d testoutput/tiff_compress/color/8 ]
+
+echo "Generate metadata"
+python -m wkcuber.metadata --name great_dataset --scale 11.24,11.24,25 testoutput/tiff
+python -m wkcuber.metadata --name great_dataset --scale 11.24,11.24,25 testoutput/tiff_compress
+
+echo "Check equality for uncompressed and compressed dataset"
+python -m wkcuber.check_equality testoutput/tiff testoutput/tiff_compress
+
+echo "Create broken copy of dataset"
+rm -rf testoutput/tiff_compress-broken
+cp -R testoutput/tiff_compress{,-broken}
+rm -r testoutput/tiff_compress-broken/color/1/z0/y0/x0.wkw
+
+echo "Compare original dataset to broken one and expect to determine difference"
+if python -m wkcuber.check_equality testoutput/tiff testoutput/tiff_compress-broken ; then
+  echo "Equality check did not fail even though the dataset is broken."
+  exit 1
+else
+  echo "Equality check failed as expected for broken dataset."
+  exit 0
+fi
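
The final block is a negative test: once a cube file has been removed, wkcuber.check_equality is expected to exit non-zero. A sketch of the same check from Python, using subprocess (paths as in the script above):

    import subprocess

    # Run the equality check against the intentionally broken copy
    result = subprocess.run(
        ["python", "-m", "wkcuber.check_equality",
         "testoutput/tiff", "testoutput/tiff_compress-broken"]
    )
    # The assertion inside check_equality should make the process fail
    assert result.returncode != 0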

wkcuber/api/Properties/LayerProperties.py

Lines changed: 5 additions & 0 deletions
@@ -3,6 +3,7 @@
 from wkw import wkw
 
 from wkcuber.mag import Mag
+from wkcuber.api.bounding_box import BoundingBox
 
 
 def extract_num_channels(num_channels_in_properties, path, layer, mag):
@@ -107,6 +108,10 @@ def _add_resolution(self, resolution):
     def _delete_resolution(self, resolution):
         self._wkw_magnifications.delete(resolution)
 
+    def get_bounding_box(self) -> BoundingBox:
+
+        return BoundingBox(self.get_bounding_box_offset(), self.get_bounding_box_size())
+
     def get_bounding_box_size(self) -> tuple:
         return (
             self.bounding_box["width"],
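
A quick sketch of how the new accessor fits together with the existing ones (the dataset path and layer name are placeholders):

    from wkcuber.api.Dataset import WKDataset

    layer_properties = WKDataset("testoutput/tiff").properties.data_layers["color"]
    # get_bounding_box() combines get_bounding_box_offset() and
    # get_bounding_box_size() into a single BoundingBox
    bbox = layer_properties.get_bounding_box()
    print(bbox.topleft, bbox.size)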

wkcuber/check_equality.py

Lines changed: 149 additions & 0 deletions
@@ -0,0 +1,149 @@
+import logging
+from argparse import ArgumentParser
+from wkcuber.api.Dataset import WKDataset
+from wkcuber.api.bounding_box import BoundingBox
+import numpy as np
+
+from .utils import (
+    add_verbose_flag,
+    open_wkw,
+    WkwDatasetInfo,
+    add_distribution_flags,
+    get_executor_for_args,
+    wait_and_ensure_success,
+    setup_logging,
+)
+from .metadata import detect_resolutions, detect_bbox, detect_layers
+import functools
+from .compress import BACKUP_EXT
+
+CHUNK_SIZE = 1024
+
+
+def named_partial(func, *args, **kwargs):
+    # Propagate __name__ and __doc__ attributes to partial function
+    partial_func = functools.partial(func, *args, **kwargs)
+    functools.update_wrapper(partial_func, func)
+    if hasattr(func, "__annotations__"):
+        # Generic types cannot be pickled in Python <= 3.6, see https://github.com/python/typing/issues/511
+        partial_func.__annotations__ = {}
+    return partial_func
+
+
+def create_parser():
+    parser = ArgumentParser()
+
+    parser.add_argument("source_path", help="Path to input WKW dataset")
+
+    parser.add_argument(
+        "target_path", help="WKW dataset with which to compare the input dataset."
+    )
+
+    parser.add_argument(
+        "--layer_name",
+        "-l",
+        help="Name of the layer to compare (if not provided, all layers are compared)",
+        default=None,
+    )
+
+    add_verbose_flag(parser)
+    add_distribution_flags(parser)
+
+    return parser
+
+
+def assert_equality_for_chunk(
+    source_path: str, target_path: str, layer_name: str, mag, sub_box
+):
+    wk_dataset = WKDataset(source_path)
+    layer = wk_dataset.layers[layer_name]
+    backup_wkw_info = WkwDatasetInfo(target_path, layer_name, mag, header=None)
+    with open_wkw(backup_wkw_info) as backup_wkw:
+        mag_ds = layer.get_mag(mag)
+        logging.info(f"Checking sub_box: {sub_box}")
+
+        data = mag_ds.read(sub_box.size, sub_box.topleft)
+        backup_data = backup_wkw.read(sub_box.topleft, sub_box.size)
+        assert np.all(
+            data == backup_data
+        ), f"Data differs in bounding box {sub_box} for layer {layer_name} with mag {mag}"
+
+
+def check_equality(source_path: str, target_path: str, args=None):
+
+    logging.info(f"Comparing {source_path} with {target_path}")
+
+    wk_src_dataset = WKDataset(source_path)
+    src_layer_names = wk_src_dataset.layers.keys()
+    target_layer_names = [
+        layer["name"] for layer in detect_layers(target_path, 0, False)
+    ]
+    assert set(src_layer_names) == set(
+        target_layer_names
+    ), f"The provided input datasets have different layers: {src_layer_names} != {target_layer_names}"
+
+    existing_layer_names = src_layer_names
+
+    if args.layer_name is not None:
+        assert (
+            args.layer_name in existing_layer_names
+        ), f"Provided layer {args.layer_name} does not exist in input dataset."
+        existing_layer_names = [args.layer_name]
+
+    for layer_name in existing_layer_names:
+
+        logging.info(f"Checking layer_name: {layer_name}")
+
+        source_mags = list(detect_resolutions(source_path, layer_name))
+        target_mags = list(detect_resolutions(target_path, layer_name))
+        source_mags.sort()
+        target_mags.sort()
+        mags = source_mags
+
+        assert (
+            source_mags == target_mags
+        ), f"The mags between {source_path}/{layer_name} and {target_path}/{layer_name} are not equal: {source_mags} != {target_mags}"
+
+        layer_properties = wk_src_dataset.properties.data_layers[layer_name]
+
+        official_bbox = layer_properties.get_bounding_box()
+
+        for mag in mags:
+            inferred_src_bbox = BoundingBox.from_auto(
+                detect_bbox(source_path, layer_name, mag)
+            )
+            inferred_target_bbox = BoundingBox.from_auto(
+                detect_bbox(target_path, layer_name, mag)
+            )
+
+            bbox = inferred_src_bbox.extended_by(inferred_target_bbox).extended_by(
+                official_bbox
+            )
+            logging.info(f"Start verification of {layer_name} in mag {mag} in {bbox}")
+
+            with get_executor_for_args(args) as executor:
+                boxes = list(
+                    bbox.chunk([CHUNK_SIZE, CHUNK_SIZE, CHUNK_SIZE], [CHUNK_SIZE])
+                )
+                assert_fn = named_partial(
+                    assert_equality_for_chunk, source_path, target_path, layer_name, mag
+                )
+
+                wait_and_ensure_success(executor.map_to_futures(assert_fn, boxes))
+
+    logging.info(
+        f"The following datasets seem to be equal (with regard to the layers: {existing_layer_names}):"
+    )
+    logging.info(source_path)
+    logging.info(target_path)
+
+
+if __name__ == "__main__":
+    args = create_parser().parse_args()
+    setup_logging(args)
+
+    if args.target_path is None:
+        target_path = args.source_path + BACKUP_EXT
+    else:
+        target_path = args.target_path
+    check_equality(args.source_path, target_path, args)
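
Aside on named_partial above: a bare functools.partial has no __name__, which can break executors that log or pickle the task function by name. A minimal illustration (greet is a hypothetical function):

    from wkcuber.check_equality import named_partial

    def greet(name, greeting):
        return f"{greeting}, {name}!"

    named = named_partial(greet, greeting="hi")
    # functools.update_wrapper copied the wrapped function's metadata over
    assert named.__name__ == "greet"
    assert named("world") == "hi, world!"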

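The verification strategy in check_equality: take the union of the bounding boxes inferred from both datasets and the one recorded in the properties, split it into CHUNK_SIZE³ sub-boxes, and compare the chunks in parallel. A simplified serial sketch that reads both sides through the high-level API instead of open_wkw (illustration only, not the committed code path):

    import numpy as np
    from wkcuber.api.Dataset import WKDataset

    def naive_check(source_path, target_path, layer_name, mag, bbox):
        src_mag = WKDataset(source_path).layers[layer_name].get_mag(mag)
        dst_mag = WKDataset(target_path).layers[layer_name].get_mag(mag)
        # bbox.chunk yields axis-aligned sub-boxes, as in check_equality above
        for sub_box in bbox.chunk([1024, 1024, 1024], [1024]):
            a = src_mag.read(sub_box.size, sub_box.topleft)
            b = dst_mag.read(sub_box.size, sub_box.topleft)
            assert np.array_equal(a, b), f"Data differs in {sub_box}"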
wkcuber/compress.py

Lines changed: 5 additions & 4 deletions
@@ -16,10 +16,11 @@
     wait_and_ensure_success,
     setup_logging,
 )
-from .metadata import detect_resolutions
-from .metadata import convert_element_class_to_dtype
+from .metadata import detect_resolutions, convert_element_class_to_dtype
 from typing import List
 
+BACKUP_EXT = ".bak"
+
 
 def create_parser():
     parser = ArgumentParser()
@@ -128,11 +129,11 @@ def compress_mags(
         compress_mag(source_path, layer_name, target_path, mag, args)
 
     if with_tmp_dir:
-        makedirs(path.join(source_path + ".bak", layer_name), exist_ok=True)
+        makedirs(path.join(source_path + BACKUP_EXT, layer_name), exist_ok=True)
         for mag in mags:
            shutil.move(
                 path.join(source_path, layer_name, str(mag)),
-                path.join(source_path + ".bak", layer_name, str(mag)),
+                path.join(source_path + BACKUP_EXT, layer_name, str(mag)),
             )
             shutil.move(
                 path.join(target_path, layer_name, str(mag)),