Skip to content

Commit d814302

Browse files
authored
Remove npz output (#9)
* remove doc npz references
* remove npz references in ImageChip
* remove npz methods from RemoveBackgroundOnly
* test references to npz removed
* Increment patch version
* formatting fix
* imports and formatting
1 parent b897f47 commit d814302

File tree

6 files changed

+19
-276
lines changed

6 files changed

+19
-276
lines changed

README.md

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,6 @@ image_chipper = ImageChip(
4040
output_path="path/to/output_directory_image",
4141
pixel_dimensions=128,
4242
offset=64,
43-
output_format="tif",
4443
)
4544

4645
# set a min max normaliser
@@ -50,9 +49,8 @@ image_chipper.set_normaliser(min_val=500, max_val=3000)
5049
# Generate chips
5150
image_chipper.chip_image()
5251
```
53-
With the `output_format` parameter set to `"tif"`, each resulting tile is named using a suffix that represents the bottom left `(x, y)`
54-
pixel coordinate position. If output_format is set to `"npz"`, the resulting .npz zip file contains a dictionary of arrays,
55-
where the keys are the same as these tile names. By default, the prefix of each tile name is taken from the input image file name
52+
Each resulting tile is named using a suffix that represents the bottom left `(x, y)`
53+
pixel coordinate position. By default, the prefix of each tile name is taken from the input image file name
5654
(`input_image_path`), unless you specify `output_name`.
5755

5856
Setting the parameter `use_multiprocessing=True` (the default) makes the chipping process faster by using multiple cores.
@@ -84,13 +82,12 @@ image_chipper = ImageChip(
8482
output_name="large_image",
8583
pixel_dimensions=128,
8684
offset=64,
87-
output_format="tif",
8885
)
8986
image_chipper.chip_image()
9087
```
9188

9289
### 3. RemoveBackgroundOnly Class
93-
The `RemoveBackgroundOnly` class provides functionality to remove image chips (either could be tifs or numpy arrays inside npz file) that contain only background. Filtering out images only containing background helps to prepare a dataset more suitable for training models.
90+
The `RemoveBackgroundOnly` class provides functionality to remove image chips that contain only background. Filtering out images only containing background helps to prepare a dataset more suitable for training models.
9491

9592
```python
9693
from rschip import RemoveBackgroundOnly

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
44

55
[project]
66
name = "rschip"
7-
version = "0.3.2"
7+
version = "0.3.3"
88
description = "Prepare satellite images and training data for use with deep learning models"
99
readme = { file = "README.md", content-type = "text/markdown" }
1010
license = { text = "MIT" }

src/rschip/image_chip.py

Lines changed: 5 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,6 @@ class ImageChip:
2020
pixel_dimensions (int): The height and width of each tile in pixels. Defaults to 128.
2121
offset (int): The offset used when creating tiles, i.e. the step size. Defaults to 64.
2222
use_multiprocessing (bool): Whether to use multiprocessing for chipping. Defaults to True.
23-
output_format (str): The format of the output files, either 'tif' or 'npz'.
24-
If tif then tif file written per tile window. If npz then `batch_size` batches
25-
of array tiles are written into one npz file. Defaults to tif.
2623
max_batch_size (int): The maximum number of tiles to process in a batch.
2724
If multiprocessing is enabled, the actual batch size may be less. Defaults to 1000.
2825
"""
@@ -35,7 +32,6 @@ def __init__(
3532
pixel_dimensions=128,
3633
offset=64,
3734
use_multiprocessing=True,
38-
output_format="tif",
3935
max_batch_size=1000,
4036
):
4137

@@ -47,7 +43,6 @@ def __init__(
4743
self.standard_scaler = None
4844
self.normaliser = None
4945
self.use_multiprocessing = use_multiprocessing
50-
self.output_format = output_format
5146
self.max_batch_size = max_batch_size
5247
if not self.input_image_path.exists():
5348
raise FileNotFoundError(f"Input image not found: {self.input_image_path}")
@@ -56,10 +51,6 @@ def __init__(
5651
raise ValueError("pixel_dimensions must be a positive integer")
5752
if self.offset <= 0:
5853
raise ValueError("offset must be a positive integer")
59-
if self.output_format not in ("tif", "npz"):
60-
raise ValueError(
61-
f"output_format must be 'tif' or 'npz', got '{self.output_format}'"
62-
)
6354

6455
def _read_image_metadata(self) -> None:
6556
"""
@@ -133,22 +124,6 @@ def _save_chip(self, chip, transform, output_file_path, d_type, src) -> None:
133124
) as dst:
134125
dst.write(chip)
135126

136-
def _save_batch_as_npz(self, batch, batch_index) -> None:
137-
"""
138-
Save a batch of chips as an NPZ file.
139-
140-
Args:
141-
batch (dict): Dictionary containing image chips.
142-
batch_index (int): Index of the batch.
143-
144-
Returns:
145-
None: Writes the batch to an NPZ file at the specified path.
146-
"""
147-
output_file_path = self.output_path / f"batch_{batch_index}.npz"
148-
if output_file_path.exists():
149-
output_file_path.unlink()
150-
np.savez_compressed(output_file_path, **batch)
151-
152127
def _output_file(self, x: int, y: int) -> Path:
153128
"""
154129
Generate the output file path for a chip based on its x and y coordinates.
@@ -428,8 +403,7 @@ def _process_batch(self, batch_vals):
428403
Returns:
429404
None
430405
"""
431-
batch_id, batch = batch_vals
432-
out = {}
406+
_, batch = batch_vals
433407
with rio.open(self.input_image_path) as src:
434408
for x, y, window in batch:
435409
chip = src.read(
@@ -440,16 +414,9 @@ def _process_batch(self, batch_vals):
440414
if self.normaliser:
441415
chip = self.apply_normaliser(chip, self.normaliser)
442416

443-
if self.output_format == "tif":
444-
output_file_path = self._output_file(x, y)
445-
transform = src.window_transform(window)
446-
self._save_chip(chip, transform, output_file_path, chip.dtype, src)
447-
elif self.output_format == "npz":
448-
arr_name = f"{self.output_name}_{x}_{y}"
449-
out[arr_name] = chip
450-
451-
if self.output_format == "npz" and out:
452-
self._save_batch_as_npz(out, batch_id)
417+
output_file_path = self._output_file(x, y)
418+
transform = src.window_transform(window)
419+
self._save_chip(chip, transform, output_file_path, chip.dtype, src)
453420

454421
def _calculate_batches(self, windows):
455422
"""
@@ -486,7 +453,7 @@ def chip_image(self) -> None:
486453
Method uses rasterio to read a satellite image, then splits the image into
487454
smaller square tiles of specified dimensions and saves them to the output path.
488455
489-
The output tile file names are suffixed with x and y offsets and saved as TIFF files or NPZ files.
456+
The output tile file names are suffixed with x and y offsets and saved as TIFF files.
490457
Optionally the chip pixel values can be standard scaled before saving, using a sample of the full image pixels.
491458
492459
Returns:

src/rschip/remove_background_only.py

Lines changed: 0 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -35,19 +35,6 @@ def _find_image_eq_mask(
3535
)
3636
return image_chips_dir / image_file
3737

38-
def _find_img_npz_eq_mask(self, class_npz_file: Path, image_npz_dir: str) -> Path:
39-
image_npz_dir = Path(image_npz_dir)
40-
class_file = class_npz_file.name
41-
return image_npz_dir / class_file
42-
43-
def _find_img_key_from_mask_key(
44-
self, class_key: str, masks_prefix: Optional[str], images_prefix: Optional[str]
45-
) -> str:
46-
img_key = class_key.replace(
47-
self._prefix_checker(masks_prefix), self._prefix_checker(images_prefix)
48-
)
49-
return img_key
50-
5138
def check_background_only(self, class_arr: np.ndarray) -> bool:
5239
"""
5340
Check if an image mask has more than the specified number of non-background pixels.
@@ -106,57 +93,3 @@ def remove_background_only_files(
10693

10794
image_files = list(class_chips_dir.glob(f"**/*.{image_extn}"))
10895
print(f"{len(image_files)} in {class_chips_dir} after.")
109-
110-
def remove_background_only_npz(
111-
self,
112-
class_npz_dir: str,
113-
image_npz_dir: str,
114-
masks_prefix: Optional[str] = None,
115-
images_prefix: Optional[str] = None,
116-
) -> None:
117-
"""
118-
Remove arrays from NPZ files where the mask contains background only.
119-
120-
Args:
121-
class_npz_dir (str): Directory containing the chip mask NPZ files to check.
122-
image_npz_dir (str): Corresponding chip image NPZ file directory - if mask is all background, image is removed too.
123-
masks_prefix (str, optional): Prefix for mask files. Defaults to None. This prefix is removed when checking for
124-
equivalent mask to image file.
125-
images_prefix (str, optional): As `masks_prefix`. Prefix for image files. Defaults to None.
126-
127-
Raises:
128-
FileNotFoundError: If no NPZ files are found in the input directory.
129-
"""
130-
class_npz_dir = Path(class_npz_dir)
131-
npz_files = list(class_npz_dir.glob("**/*.npz"))
132-
if not npz_files:
133-
raise FileNotFoundError(f"No npz files found in {class_npz_dir}")
134-
135-
for f in npz_files:
136-
npz_class_dict = np.load(f)
137-
img_npz_file = self._find_img_npz_eq_mask(f, image_npz_dir)
138-
npz_image_dict = np.load(img_npz_file)
139-
140-
out_class_dict = {}
141-
out_img_dict = {}
142-
143-
print(f"{f.name} initially {len(npz_class_dict.keys())}...")
144-
for key in npz_class_dict.files:
145-
class_arr = npz_class_dict[key]
146-
if not self.check_background_only(class_arr):
147-
out_class_dict[key] = class_arr
148-
img_key = self._find_img_key_from_mask_key(
149-
key, masks_prefix, images_prefix
150-
)
151-
out_img_dict[img_key] = npz_image_dict[img_key]
152-
print(f"{f.name} finally {len(out_class_dict.keys())}...")
153-
npz_class_dict.close()
154-
npz_image_dict.close()
155-
f.unlink()
156-
img_npz_file.unlink()
157-
158-
if out_class_dict:
159-
np.savez_compressed(f, **out_class_dict)
160-
np.savez_compressed(img_npz_file, **out_img_dict)
161-
else:
162-
print(f"No valid entries left in {f.name}, deleting the NPZ file.")

tests/test_image_chip.py

Lines changed: 10 additions & 101 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@ def chip_image_run(
2121
pixel_dimensions=128,
2222
offset=64,
2323
use_multiprocessing=True,
24-
output_format="tif",
2524
max_batch_size=10,
2625
scaler=False,
2726
normaliser=False,
@@ -32,7 +31,6 @@ def chip_image_run(
3231
pixel_dimensions=pixel_dimensions,
3332
offset=offset,
3433
use_multiprocessing=use_multiprocessing,
35-
output_format=output_format,
3634
max_batch_size=max_batch_size,
3735
)
3836
if scaler:
@@ -43,106 +41,17 @@ def chip_image_run(
4341
image_chip.chip_image()
4442

4543

46-
def load_npz(npz_file_path):
47-
with np.load(npz_file_path) as data:
48-
test_key = data.files[0]
49-
arr = data[test_key]
50-
return test_key, arr
51-
52-
5344
def load_tif(tif_file_path):
5445
with rio.open(tif_file_path) as f:
5546
arr = f.read()
5647
prof = f.profile
5748
return arr, prof
5849

5950

60-
def npz_files_to_list(out_dir):
61-
array_list = []
62-
for file_path in Path(out_dir).glob("*.npz"):
63-
with np.load(file_path) as data:
64-
for array_name in data.files:
65-
array_list.append(data[array_name])
66-
return array_list
67-
68-
6951
def tif_files_to_list(out_dir):
7052
return list(Path(out_dir).glob("*.tif"))
7153

7254

73-
def test_image_chip(setup_output_dir):
74-
out_dir = setup_output_dir
75-
# Test chipping with TIFF output
76-
chip_image_run(output_path=out_dir)
77-
78-
# Verify that TIFF files were created
79-
tif_files = list(out_dir.glob("*.tif"))
80-
assert len(tif_files) > 0, "No TIFF files were created."
81-
82-
# Test chipping with NPZ output
83-
chip_image_run(output_path=out_dir, output_format="npz")
84-
85-
# Verify that NPZ files were created
86-
npz_files = list(out_dir.glob("*.npz"))
87-
assert len(npz_files) > 0, "No NPZ files were created."
88-
89-
90-
def test_array_equality(setup_output_dir):
91-
out_dir = setup_output_dir
92-
chip_image_run(output_path=out_dir, output_format="tif")
93-
chip_image_run(output_path=out_dir, output_format="npz")
94-
95-
# Load one NPZ and corresponding TIFF file to compare arrays
96-
test_key, npz_arr = load_npz(out_dir / "batch_0.npz")
97-
tif_arr, _ = load_tif(f"{out_dir}/{test_key}.tif")
98-
99-
# Compare arrays
100-
assert np.array_equal(tif_arr, npz_arr), "Arrays are not equal."
101-
102-
103-
def test_image_and_array_count(setup_output_dir):
104-
out_dir = setup_output_dir
105-
106-
chip_image_run(output_path=out_dir, output_format="tif")
107-
chip_image_run(output_path=out_dir, output_format="npz")
108-
109-
# Compare the number of TIFF images and NPZ arrays
110-
npz_array_count = len(npz_files_to_list(out_dir))
111-
tif_file_count = len(tif_files_to_list(out_dir))
112-
assert npz_array_count == tif_file_count, "Mismatch in number of arrays and images."
113-
114-
115-
def test_scaler_functionality():
116-
image_chip = ImageChip(
117-
input_image_path="tests/data/test_img.tif", output_path="tmp"
118-
)
119-
sample_array = np.random.rand(3, 128, 128) * 100
120-
scaler_dict = {
121-
0: {"mean": 50, "std": 10},
122-
1: {"mean": 60, "std": 15},
123-
2: {"mean": 40, "std": 20},
124-
}
125-
scaled_array = image_chip.apply_scaler(sample_array, scaler_dict)
126-
unscaled_array = ImageChip.unapply_scaler(scaled_array, scaler_dict)
127-
assert np.allclose(
128-
sample_array, unscaled_array, atol=1e-1
129-
), "Scaling/unscaling mismatch"
130-
131-
132-
def test_large_window_image(setup_output_dir):
133-
out_dir = setup_output_dir
134-
chip_image_run(
135-
output_path=out_dir, output_format="tif", pixel_dimensions=512, offset=256
136-
)
137-
chip_image_run(
138-
output_path=out_dir, output_format="npz", pixel_dimensions=512, offset=256
139-
)
140-
141-
# Verify that at least one file was created
142-
tif_files = tif_files_to_list(out_dir)
143-
assert len(tif_files) > 0, "No TIFF files were created for large window."
144-
145-
14655
def test_multiprocessor_not(setup_output_dir):
14756
out_dir = setup_output_dir
14857
chip_image_run(output_path=out_dir, use_multiprocessing=True)
@@ -180,7 +89,6 @@ def test_tile_count(setup_output_dir):
18089
input_image_path=input_image_path,
18190
pixel_dimensions=pixel_dimensions,
18291
offset=offset,
183-
output_format="tif",
18492
)
18593

18694
# Verify that the expected number of tiles were created
@@ -193,7 +101,7 @@ def test_tile_count(setup_output_dir):
193101
def test_normalising(setup_output_dir):
194102
out_dir = setup_output_dir
195103

196-
chip_image_run(output_path=out_dir, output_format="tif", normaliser=True)
104+
chip_image_run(output_path=out_dir, normaliser=True)
197105

198106
with rio.open(tif_files_to_list(out_dir)[0]) as f:
199107
test_array = f.read()
@@ -204,7 +112,7 @@ def test_normalising(setup_output_dir):
204112
def test_standard_scaling(setup_output_dir):
205113
out_dir = setup_output_dir
206114

207-
chip_image_run(output_path=out_dir, output_format="tif", scaler=True)
115+
chip_image_run(output_path=out_dir, scaler=True)
208116

209117
with rio.open(tif_files_to_list(out_dir)[0]) as f:
210118
test_array = f.read()
@@ -371,13 +279,14 @@ def test_init_invalid_offset():
371279
)
372280

373281

374-
def test_init_invalid_output_format():
375-
with pytest.raises(ValueError, match="output_format must be 'tif' or 'npz'"):
376-
ImageChip(
377-
input_image_path="tests/data/test_img.tif",
378-
output_path="tmp",
379-
output_format="jpg",
380-
)
282+
def test_image_chip(setup_output_dir):
283+
out_dir = setup_output_dir
284+
# Test chipping with TIFF output
285+
chip_image_run(output_path=out_dir)
286+
287+
# Verify that TIFF files were created
288+
tif_files = list(out_dir.glob("*.tif"))
289+
assert len(tif_files) > 0, "No TIFF files were created."
381290

382291

383292
def test_nodata_val_set_from_image():

0 commit comments

Comments
 (0)