Skip to content

Commit cf210d4

Browse files
authored
Background check not remove (#10)
* initial switch to check only * background class name change * background check desc * background check test * rename background module * rename background test * background use in README * module import background * Increment version * type comparison * format changes * flake8 fix * sort file list * count check * pandas dependency * checker removal comments
1 parent d814302 commit cf210d4

File tree

6 files changed

+170
-132
lines changed

6 files changed

+170
-132
lines changed

README.md

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ transformers (ViT) such as [Segment Anything](https://arxiv.org/abs/2304.02643).
1313
or standard scale before writing chips as required.
1414
- **Mask Segmentation**: Generate segmentation mask images from geopackage or shapefile features for supervised
1515
segmentation, e.g using [U-Net](https://arxiv.org/abs/1505.04597).
16-
- **Remove Background Chips**: Filter out image chips containing only background. Useful for when preparing training
16+
- **Check Background Chips**: Identify image chips containing only background. Useful for when preparing training
1717
and testing datasets.
1818

1919
## Installation
@@ -86,23 +86,23 @@ image_chipper = ImageChip(
8686
image_chipper.chip_image()
8787
```
8888

89-
### 3. RemoveBackgroundOnly Class
90-
The `RemoveBackgroundOnly` class provides functionality to remove image chips that contain only background. Filtering out images only containing background helps to prepare a dataset more suitable for training models.
89+
### 3. CheckBackgroundOnly Class
90+
The `CheckBackgroundOnly` class provides functionality to list image chips that contain only background. Filtering out images only containing background helps to prepare a dataset more suitable for training models.
9191

9292
```python
9393
from rschip import CheckBackgroundOnly
9494

9595
# Initialize the CheckBackgroundOnly instance
96-
remover = RemoveBackgroundOnly(background_val=0, non_background_min=100)
96+
checker = CheckBackgroundOnly(background_val=0, non_background_min=1)
9797

98-
# Remove chips with only background
99-
remover.remove_background_only_files(
98+
# Find chips with only background
99+
checker.check_background_chips(
100100
class_chips_dir="path/to/mask_directory",
101101
image_chips_dir="path/to/image_directory"
102102
)
103103
```
104104
The default assumption is that image and mask equivalent have the same file names as shown in example 2. above. If that is
105-
not the case, use the `masks_prefix`, `images_prefix` arguments which are prefix strings which are removed on checking for
105+
not the case, use the `masks_prefix` and `images_prefix` arguments: prefix strings that are swapped when looking up the
106106
image to mask equivalent using the bottom left (x,y) indices found in the outputs generated by `ImageChip.create_chips()`.
107107

108108
## License

pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
44

55
[project]
66
name = "rschip"
7-
version = "0.3.3"
7+
version = "0.3.4"
88
description = "Prepare satellite images and training data for use with deep learning models"
99
readme = { file = "README.md", content-type = "text/markdown" }
1010
license = { text = "MIT" }
@@ -32,6 +32,7 @@ dependencies = [
3232
"geopandas",
3333
"shapely",
3434
"tqdm",
35+
"pandas",
3536
]
3637
requires-python = ">=3.9"
3738

src/rschip/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
from rschip.image_chip import ImageChip # noqa: F401
22
from rschip.segmentation_mask import SegmentationMask # noqa: F401
3-
from rschip.remove_background_only import RemoveBackgroundOnly # noqa: F401
3+
from rschip.check_background import CheckBackgroundOnly # noqa: F401

src/rschip/check_background.py

Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
from pathlib import Path
2+
import rasterio as rio
3+
import numpy as np
4+
from typing import Optional, Union
5+
import pandas as pd
6+
7+
8+
class CheckBackgroundOnly:
    """
    Check arrays where the segmentation mask class values show background only.

    This class is used to identify image chips that contain only background based on
    their corresponding segmentation masks. Nothing is deleted: results are reported
    in a DataFrame / CSV so the caller can decide what to do with the chips.

    Attributes:
        background_val (int | float): The value in the mask image array that represents the background class.
            Defaults to 0.
        non_background_min (int): The minimum number of non-background pixels required to consider a chip
            as not background-only. Defaults to 1.
    """

    def __init__(
        self, background_val: Union[int, float] = 0, non_background_min: int = 1
    ):
        self.background_val = background_val
        self.non_background_min = non_background_min

    @staticmethod
    def _prefix_checker(prefix: Optional[str]) -> str:
        """Return `prefix` unchanged, or an empty string when it is None."""
        return "" if prefix is None else prefix

    def _find_image_eq_mask(
        self,
        class_chip_path: Path,
        image_chips_dir: Union[str, Path],
        masks_prefix: Optional[str],
        images_prefix: Optional[str],
    ) -> Path:
        """
        Build the path of the image chip that corresponds to a mask chip.

        The mask file name has its leading `masks_prefix` swapped for `images_prefix`
        and is then placed in `image_chips_dir`.

        Args:
            class_chip_path (Path): Path to the mask chip file.
            image_chips_dir (str | Path): Directory holding the image chips.
            masks_prefix (str, optional): Leading string of mask file names. None means no prefix.
            images_prefix (str, optional): Leading string of image file names. None means no prefix.

        Returns:
            Path: The expected image chip path. Its existence is not checked.
        """
        mask_prefix = self._prefix_checker(masks_prefix)
        image_prefix = self._prefix_checker(images_prefix)
        file_name = class_chip_path.name

        if not mask_prefix:
            # str.replace("", new) would insert `new` between every character,
            # so an empty mask prefix must be handled by simple prepending.
            file_name = image_prefix + file_name
        elif file_name.startswith(mask_prefix):
            # Swap only the leading prefix; later occurrences of the same text
            # in the file name are part of the chip identifier and are kept.
            file_name = image_prefix + file_name[len(mask_prefix):]
        # Otherwise the mask name does not carry the prefix: leave it unchanged.

        return Path(image_chips_dir) / file_name

    def check_background_only(self, class_arr: np.ndarray) -> bool:
        """
        Check if an image mask has fewer than the specified number of non-background pixels.

        Args:
            class_arr (numpy.ndarray): A NumPy array of class labels for each pixel in an image mask.

        Returns:
            bool: True if the image mask has a non-background pixel count < `non_background_min`. False otherwise.
        """
        non_background_count = np.count_nonzero(class_arr != self.background_val)
        # Cast so callers get a plain Python bool, matching the annotation,
        # rather than a numpy.bool_.
        return bool(non_background_count < self.non_background_min)

    def check_background_chips(
        self,
        class_chips_dir: Union[str, Path],
        image_chips_dir: Union[str, Path],
        image_extn: str = "tif",
        masks_prefix: Optional[str] = None,
        images_prefix: Optional[str] = None,
    ) -> pd.DataFrame:
        """
        Checks chip files to identify which ones are background only and returns a DataFrame.

        This method iterates through mask chip files, checks if they are background-only,
        and writes the results to a CSV file named 'background_only_check.csv' in the
        `class_chips_dir`. The results are also returned as a pandas DataFrame.

        The CSV file and DataFrame will contain the following columns:
        - mask_file: The path to the mask chip file.
        - image_file: The path to the corresponding image chip file.
        - is_background_only: A boolean indicating if the mask is background-only.

        Args:
            class_chips_dir (str | Path): Directory containing the chip mask image files to check.
                It is searched recursively.
            image_chips_dir (str | Path): Corresponding chip image file directory.
            image_extn (str, optional): The extension for the image files, with or without a
                leading dot. Defaults to "tif".
            masks_prefix (str, optional): Prefix for mask files. Defaults to None. This leading prefix is
                swapped for `images_prefix` when deriving the equivalent image file name.
            images_prefix (str, optional): As `masks_prefix`. Prefix for image files. Defaults to None.

        Returns:
            pd.DataFrame: A DataFrame with the check results, one row per mask file, sorted by mask path.

        Raises:
            FileNotFoundError: If no files with the specified extension are found in the input directory.
        """
        class_chips_dir = Path(class_chips_dir)
        # Accept ".tif" as well as "tif"; a leading dot would otherwise build
        # the pattern "**/*..tif" and silently match nothing.
        extn = image_extn.lstrip(".")
        mask_files = sorted(class_chips_dir.glob(f"**/*.{extn}"))
        if not mask_files:
            raise FileNotFoundError(f"No {image_extn} files found in {class_chips_dir}")

        print(f"Checking {len(mask_files)} files in {class_chips_dir}.")

        audit_data = []
        for mask_path in mask_files:
            # Only the first band is read; it is taken to hold the class labels.
            with rio.open(mask_path) as src:
                mask_arr = src.read(1)
            # NOTE(review): masks are found recursively, but the image path is
            # built from the bare file name directly under `image_chips_dir` --
            # confirm whether nested mask directories should be mirrored.
            audit_data.append(
                {
                    "mask_file": mask_path,
                    "image_file": self._find_image_eq_mask(
                        mask_path, image_chips_dir, masks_prefix, images_prefix
                    ),
                    "is_background_only": self.check_background_only(mask_arr),
                }
            )

        df = pd.DataFrame(audit_data)

        # Save to CSV
        output_csv_path = class_chips_dir / "background_only_check.csv"
        df.to_csv(output_csv_path, index=False)

        print(f"Check results written to {output_csv_path}")
        return df

src/rschip/remove_background_only.py

Lines changed: 0 additions & 95 deletions
This file was deleted.
Lines changed: 38 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,8 @@
33
import tempfile
44
from rschip import ImageChip
55
from rschip import SegmentationMask
6-
from rschip import RemoveBackgroundOnly
6+
from rschip import CheckBackgroundOnly
7+
import pandas as pd
78

89

910
@pytest.fixture(scope="function")
@@ -37,13 +38,16 @@ def tif_files_to_list(out_dir):
3738
return list(Path(out_dir).glob("*.tif"))
3839

3940

40-
# repeat the same but for tif file chips not npz
41-
def test_tif_remove(setup_output_dir):
41+
def test_background_check(setup_output_dir):
4242
out_dir = setup_output_dir
4343
out_mask = out_dir / "output_mask.tif"
4444
out_mask_chips = out_dir / "mask_chips"
4545
out_img_chips = out_dir / "img_chips"
4646

47+
# Create directories for chips
48+
out_mask_chips.mkdir()
49+
out_img_chips.mkdir()
50+
4751
mask_creator = SegmentationMask(
4852
"tests/data/test_img.tif", "tests/data/test_features.gpkg", out_mask
4953
)
@@ -62,24 +66,36 @@ def test_tif_remove(setup_output_dir):
6266
tif_img_files_init = len(list(out_img_chips.glob("*.tif")))
6367
tif_mask_files_init = len(list(out_mask_chips.glob("*.tif")))
6468

65-
remover = RemoveBackgroundOnly(background_val=0, non_background_min=100)
69+
checker = CheckBackgroundOnly(background_val=0, non_background_min=100)
6670

67-
remover.remove_background_only_files(out_mask_chips, out_img_chips)
71+
df = checker.check_background_chips(str(out_mask_chips), str(out_img_chips))
6872

6973
tif_img_files_final = len(list(out_img_chips.glob("*.tif")))
7074
tif_mask_files_final = len(list(out_mask_chips.glob("*.tif")))
7175

72-
assert tif_img_files_final < tif_img_files_init, "No img files were removed"
76+
# Check that no files were removed
77+
assert tif_img_files_final == tif_img_files_init, "Image files were removed"
78+
assert tif_mask_files_final == tif_mask_files_init, "Mask files were removed"
7379

74-
assert tif_mask_files_final < tif_mask_files_init, "No chip files were removed"
80+
# Check that CSV was created
81+
csv_path = out_mask_chips / "background_only_check.csv"
82+
assert csv_path.exists(), "Audit CSV file was not created."
7583

76-
assert (
77-
tif_mask_files_final == tif_img_files_final
78-
), "Remaining chips and image file number differs"
84+
# Check DataFrame and CSV content
85+
assert len(df) == tif_mask_files_init
86+
assert "is_background_only" in df.columns
87+
assert df[
88+
"is_background_only"
89+
].any(), "Expected some background-only chips to be True."
90+
assert not df[
91+
"is_background_only"
92+
].all(), "Expected some non-background chips to be False."
7993

94+
csv_df = pd.read_csv(csv_path)
95+
assert len(csv_df) == len(df)
8096

81-
# specifying a large and small background threshold and compare
82-
def test_non_background_min(setup_output_dir):
97+
98+
def test_non_background_min_check(setup_output_dir):
8399
out_dir = setup_output_dir
84100
out_mask = out_dir / "output_mask.tif"
85101
out_mask_chips = out_dir / "mask_chips"
@@ -99,22 +115,16 @@ def test_non_background_min(setup_output_dir):
99115
input_image_path=out_mask,
100116
output_name="test_img",
101117
)
102-
remover = RemoveBackgroundOnly(background_val=0, non_background_min=1)
103-
remover.remove_background_only_files(out_mask_chips, out_img_chips)
104-
105-
tif_img_files_final1 = len(list(out_img_chips.glob("*.tif")))
106-
tif_mask_files_final1 = len(list(out_mask_chips.glob("*.tif")))
118+
# Audit with a low threshold
119+
checker1 = CheckBackgroundOnly(background_val=0, non_background_min=1)
120+
df1 = checker1.check_background_chips(str(out_mask_chips), str(out_img_chips))
121+
background_only_count1 = df1["is_background_only"].sum()
107122

108-
remover = RemoveBackgroundOnly(background_val=0, non_background_min=10000)
109-
remover.remove_background_only_files(out_mask_chips, out_img_chips)
110-
111-
tif_img_files_final2 = len(list(out_img_chips.glob("*.tif")))
112-
tif_mask_files_final2 = len(list(out_mask_chips.glob("*.tif")))
113-
114-
assert (
115-
tif_img_files_final2 < tif_img_files_final1
116-
), "Non background threshold on images failed"
123+
# Audit with a high threshold
124+
checker2 = CheckBackgroundOnly(background_val=0, non_background_min=10000)
125+
df2 = checker2.check_background_chips(str(out_mask_chips), str(out_img_chips))
126+
background_only_count2 = df2["is_background_only"].sum()
117127

118128
assert (
119-
tif_mask_files_final2 < tif_mask_files_final1
120-
), "Non background threshold on masks failed"
129+
background_only_count2 >= background_only_count1
130+
), "Higher non_background_min threshold should result in more background-only chips."

0 commit comments

Comments
 (0)