
Commit bf2949c

v0.9 dataset (#88)
Rename unsupervised to weak supervised. Add datasets.

* Bump version: 0.4.2
* fix TreePolygons tests
* local tests pass
1 parent e8f2253 commit bf2949c

Large commits have some content hidden by default; only a subset of the 53 changed files is shown below.

53 files changed: +234 −143 lines

.bumpversion.cfg

Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.4.0
+current_version = 0.4.2
 commit = True
 tag = True
 parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)
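Note: a quick way to sanity-check the parse pattern above is to match it against the new version string. A minimal sketch in Python (the regex is copied from the config; everything else is illustrative):

    import re

    # parse pattern from .bumpversion.cfg, applied to the new current_version
    pattern = r"(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)"
    match = re.match(pattern, "0.4.2")
    assert match.groupdict() == {"major": "0", "minor": "4", "patch": "2"}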

.vscode/launch.json

Lines changed: 16 additions & 0 deletions
@@ -51,6 +51,22 @@
             "cwd": "${workspaceFolder}/data_prep",
             "justMyCode": true,
         },
+        {
+            "name": "Debug test_release.py",
+            "type": "debugpy",
+            "request": "launch",
+            "module": "pytest",
+            "args": [
+                "${workspaceFolder}/tests/test_release.py",
+                "-v"
+            ],
+            "console": "integratedTerminal",
+            "cwd": "${workspaceFolder}",
+            "env": {
+                "PYTHONPATH": "${workspaceFolder}:${env:PYTHONPATH}"
+            },
+            "justMyCode": false
+        }

     ]
 }
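Note: because the new entry sets "module": "pytest", the debugger runs the equivalent of python -m pytest from the workspace root with PYTHONPATH extended. A rough command-line equivalent sketched in Python (assumes the repo root is the current directory):

    import os
    import subprocess
    import sys

    # Mirror the launch config: prepend the workspace folder to PYTHONPATH,
    # then run pytest as a module on the release tests.
    env = dict(os.environ)
    env["PYTHONPATH"] = os.getcwd() + os.pathsep + env.get("PYTHONPATH", "")
    subprocess.run([sys.executable, "-m", "pytest", "tests/test_release.py", "-v"],
                   env=env, check=True)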

current.png

-444 KB

data_prep/SelvaBox.py

Lines changed: 43 additions & 12 deletions
@@ -5,17 +5,31 @@
 from tqdm import tqdm
 from PIL import Image
 import json
-import ast
 from deepforest.utilities import read_file

-def download_selvabox():
-    """Download and process the SelvaBox dataset from HuggingFace"""
+def download_selvabox(force_download=False):
+    """Download and process the SelvaBox dataset from HuggingFace
+
+    Args:
+        force_download (bool): If True, re-download parquet files even if cached
+    """

     # Create output directory (using standard MillionTrees path structure)
     output_dir = "/orange/ewhite/DeepForest/SelvaBox"
     images_dir = os.path.join(output_dir, "images")
+    cache_dir = os.path.join(output_dir, "cache")
+    annotations_csv = os.path.join(output_dir, "annotations.csv")
+
+    # Check if dataset already exists locally
+    if not force_download and os.path.exists(annotations_csv) and os.path.exists(images_dir):
+        print(f"Dataset already exists at {output_dir}")
+        print(f"Found {len(os.listdir(images_dir))} images and annotations at {annotations_csv}")
+        print("Use force_download=True to re-download the dataset")
+        return annotations_csv
+
     os.makedirs(output_dir, exist_ok=True)
     os.makedirs(images_dir, exist_ok=True)
+    os.makedirs(cache_dir, exist_ok=True)

     print("Downloading SelvaBox dataset from HuggingFace...")

@@ -36,10 +50,25 @@ def download_selvabox():
         split = file_info['split']
         parquet_url = file_info['url']

-        print(f"Processing {split} split from {parquet_url}")
+        # Cache parquet files locally
+        parquet_filename = os.path.basename(parquet_url.split('?')[0])  # Remove query params
+        cached_parquet_path = os.path.join(cache_dir, f"{split}_{parquet_filename}")
+
+        # Download parquet file if not cached or if force_download is True
+        if force_download or not os.path.exists(cached_parquet_path):
+            print(f"Downloading {split} split parquet file...")
+            parquet_response = requests.get(parquet_url, stream=True)
+            parquet_response.raise_for_status()
+
+            with open(cached_parquet_path, 'wb') as f:
+                for chunk in parquet_response.iter_content(chunk_size=8192):
+                    f.write(chunk)
+            print(f"Cached {split} split to {cached_parquet_path}")
+        else:
+            print(f"Using cached {split} split from {cached_parquet_path}")

-        # Read the parquet file directly from HuggingFace
-        df = pd.read_parquet(parquet_url)
+        # Read from cached file
+        df = pd.read_parquet(cached_parquet_path)

         print(f"Loaded {len(df)} rows from {split} split")

@@ -60,8 +89,12 @@ def download_selvabox():

             image_path = os.path.join(images_dir, image_filename)

+            # Skip saving if image already exists (unless force_download is True)
+            if not force_download and os.path.exists(image_path):
+                # Image exists, skip saving but continue to annotations
+                pass
             # Save image from bytes
-            if isinstance(image_data, dict) and 'bytes' in image_data:
+            elif isinstance(image_data, dict) and 'bytes' in image_data:
                 try:
                     image_bytes = image_data['bytes']

@@ -72,7 +105,6 @@ def download_selvabox():

                     # Convert to PNG and verify dimensions
                     with Image.open(temp_tif_path) as img:
-                        img_width, img_height = img.size
                         # Convert to RGB if necessary
                         if img.mode != 'RGB':
                             img = img.convert('RGB')
@@ -159,15 +191,14 @@ def infer_split_from_filename(p: str):
     print(f"Annotation bounds - ymax: [{annotations_df['ymax'].min():.2f}, {annotations_df['ymax'].max():.2f}]")

     # Save annotations
-    output_csv = os.path.join(output_dir, "annotations.csv")
-    annotations_df.to_csv(output_csv, index=False)
-    print(f"Annotations saved to {output_csv}")
+    annotations_df.to_csv(annotations_csv, index=False)
+    print(f"Annotations saved to {annotations_csv}")

     # Show sample of the data
     print("\nSample annotations:")
     print(annotations_df.head())

-    return output_csv
+    return annotations_csv

 if __name__ == "__main__":
     download_selvabox()
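Note: the heart of this change is a download-once cache for the HuggingFace parquet files: strip query parameters to get a stable filename, stream the response to a cache directory, and only hit the network again when force_download=True. A standalone sketch of that pattern (the function name and arguments are illustrative, not part of the module):

    import os
    import requests

    def fetch_with_cache(url, cache_dir, force_download=False):
        """Stream a remote file to a local cache and return the cached path."""
        os.makedirs(cache_dir, exist_ok=True)
        filename = os.path.basename(url.split('?')[0])  # drop query params for a stable name
        cached_path = os.path.join(cache_dir, filename)
        if force_download or not os.path.exists(cached_path):
            response = requests.get(url, stream=True)
            response.raise_for_status()
            with open(cached_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        return cached_path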

data_prep/Troles_bamberg.py

Lines changed: 2 additions & 1 deletion
@@ -64,7 +64,8 @@ def create_shapely_polygons_from_coco_segmentation_json(json_file):
 test2_polygons = create_shapely_polygons_from_coco_segmentation_json(test_set2)

 train_polygons["existing_split"] = "train"
-eval_polygons["existing_split"] = "eval"
+# The nomenclature is a bit confusing here, but val is the MillionTrees test set and test1 and test2 are the validation sets from the original paper
+eval_polygons["existing_split"] = "test"
 test1_polygons["existing_split"] = "test1"
 test2_polygons["existing_split"] = "test2"

data_prep/annotation_loop.py

Lines changed: 7 additions & 5 deletions
@@ -94,7 +94,7 @@ def _build_preannotations(df: pd.DataFrame, dataset_type: str, images_root: Path
         preannotations.append(rows.reset_index(drop=True))
     return images, preannotations

-def upload_eval_splits(version: str, base_dir: str = "/orange/ewhite/web/public/MillionTrees/", num_images: int = 100) -> None:
+def upload_eval_splits(version: str, base_dir: str = "/orange/ewhite/web/public/MillionTrees/", num_images: int = 5) -> None:
     """Upload up to `num_images` test images per dataset/split with preannotations to Label Studio.

     Projects created: MillionTrees-Eval-<dataset_type>-<split>
@@ -108,11 +108,13 @@ def upload_eval_splits(version: str, base_dir: str = "/orange/ewhite/web/public/
     sftp_client = create_sftp_client(
         user=os.getenv("USER"),
         host=os.getenv("HOST"),
-        key_filename=os.path.expanduser(os.getenv("KEY_FILENAME"))
+        key_filename=os.path.expanduser(os.getenv("KEY"))
     )

-    for dataset_type in ("TreeBoxes", "TreePoints"):
-        for split_name in ("random", "zeroshot"):
+    #for dataset_type in ("TreeBoxes", "TreePoints"):
+    #    for split_name in ("random", "zeroshot"):
+    for dataset_type in (["TreeBoxes"]):
+        for split_name in (["zeroshot"]):
             df = _load_test_records(base_dir, version, dataset_type, split_name)
             if df.empty:
                 print(f"No test records for {dataset_type} {split_name}, skipping.")
@@ -296,4 +298,4 @@ def main():

 if __name__ == "__main__":
     #main()
-    upload_eval_splits(version="v0.9", base_dir="/orange/ewhite/web/public/MillionTrees/", num_images=100)
+    upload_eval_splits(version="v0.9", base_dir="/orange/ewhite/web/public/MillionTrees/", num_images=200)

data_prep/download_ofo_unsupervised.py

Lines changed: 10 additions & 10 deletions
@@ -17,7 +17,7 @@ def _read_geopandas(path: str):
         import geopandas as gpd  # type: ignore
     except ImportError as exc:
         raise ImportError(
-            "Optional dependency missing: 'geopandas'. Install with `pip install milliontrees[unsupervised]` or `pip install geopandas`."
+            "Optional dependency missing: 'geopandas'. Install with `pip install milliontrees[weak_supervised]` or `pip install geopandas`."
         ) from exc
     return gpd.read_file(path)

@@ -28,7 +28,7 @@ def _open_raster(path: str):
         from rasterio.windows import Window  # noqa: F401
     except ImportError as exc:
         raise ImportError(
-            "Optional dependency missing: 'rasterio'. Install with `pip install milliontrees[unsupervised]` or `pip install rasterio`."
+            "Optional dependency missing: 'rasterio'. Install with `pip install milliontrees[weak_supervised]` or `pip install rasterio`."
         ) from exc
     return __import__('rasterio').open(path)

@@ -77,7 +77,7 @@ def _require_requests():
         import requests  # type: ignore
     except ImportError as exc:
         raise ImportError(
-            "Optional dependency missing: 'requests'. Install with `pip install milliontrees[unsupervised]` or `pip install requests`."
+            "Optional dependency missing: 'requests'. Install with `pip install milliontrees[weak_supervised]` or `pip install requests`."
         ) from exc
     return __import__('requests')

@@ -236,7 +236,7 @@ def run(
     allow_empty: bool = False,
 ):
     """
-    Build an unsupervised OFO points parquet by tiling orthomosaics and mapping treetops points.
+    Build a weak supervised OFO points parquet by tiling orthomosaics and mapping treetops points.

     Args:
         parquet_path: Input parquet file to load annotations and get missions to download
@@ -249,7 +249,7 @@ def run(

     images_dir = os.path.join(milliontrees_image_dir, 'images')
     ensure_dir(images_dir)
-    out_dir = os.path.join(milliontrees_image_dir, 'unsupervised')
+    out_dir = os.path.join(milliontrees_image_dir, 'weak_supervised')
     ensure_dir(out_dir)

     # Read annotations parquet to get mission IDs
@@ -354,7 +354,7 @@ def run(
                     'filename': out_name,
                     'x': (pts_tile['x'] - x0).astype(int).values,
                     'y': (pts_tile['y'] - y0).astype(int).values,
-                    'source': 'OFO treetops unsupervised',
+                    'source': 'OFO treetops weak supervised',
                     'split': 'train',
                 })
                 records.append(df_tile)
@@ -364,7 +364,7 @@ def run(
                     'filename': [out_name],
                     'x': [np.nan],
                     'y': [np.nan],
-                    'source': ['OFO treetops unsupervised'],
+                    'source': ['OFO treetops weak supervised'],
                     'split': ['train'],
                 }))
     finally:
@@ -377,13 +377,13 @@ def run(
     df = pd.concat(records, ignore_index=True)
     # Remove NaN rows if any
     df = df.dropna(subset=['x', 'y'])
-    out_path = os.path.join(out_dir, "TreePoints_OFO_unsupervised.parquet")
+    out_path = os.path.join(out_dir, "TreePoints_OFO_weak_supervised.parquet")
     df.to_parquet(out_path, index=False)
-    print(f"Wrote OFO unsupervised points to {out_path}")
+    print(f"Wrote OFO weak supervised points to {out_path}")


 def parse_args():
-    parser = argparse.ArgumentParser(description='Build OFO unsupervised points dataset')
+    parser = argparse.ArgumentParser(description='Build OFO weak supervised points dataset')
     parser.add_argument('--data_dir', required=True, help='MillionTrees dataset directory')
     parser.add_argument('--patch_size', type=int, default=800)
     parser.add_argument('--allow_empty', action='store_true')
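Note: all three import helpers in this file follow the same optional-dependency guard: attempt the import and, on failure, re-raise with an install hint that now points at the renamed weak_supervised extra. A generic sketch of the pattern (the helper name is illustrative):

    import importlib

    def _require_optional(module_name):
        """Import an optional dependency or fail with an actionable install hint."""
        try:
            return importlib.import_module(module_name)
        except ImportError as exc:
            raise ImportError(
                f"Optional dependency missing: '{module_name}'. Install with "
                f"`pip install milliontrees[weak_supervised]` or `pip install {module_name}`."
            ) from exc

    # Example: rasterio = _require_optional('rasterio')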

data_prep/label_studio_utils.py

Lines changed: 8 additions & 5 deletions
@@ -90,10 +90,8 @@ def label_studio_format(local_image_dir, preannotations, dataset_type):
                 "value": {
                     "x": float(row['x']/original_width*100),  # Ensure float
                     "y": float(row['y']/original_height*100),  # Ensure float
-                    "width": 1.0,  # Add fixed width for visibility
                     "keypointlabels": ["tree"]  # Use fixed label
                 },
-                "score": 1.0,
                 "to_name": "image",
                 "type": "keypointlabels",
                 "from_name": "label",
@@ -113,7 +111,6 @@ def label_studio_format(local_image_dir, preannotations, dataset_type):
                     "rotation": 0,
                     "rectanglelabels": ["tree"]  # Use fixed label
                 },
-                "score": 1.0,
                 "to_name": "image",
                 "type": "rectanglelabels",
                 "from_name": "label",
@@ -122,7 +119,13 @@ def label_studio_format(local_image_dir, preannotations, dataset_type):
             }
         results.append(result)

-    return {"result": results}
+    # Return a prediction object compatible with Label Studio expectations.
+    # Score belongs to the prediction, not individual results.
+    return {
+        "result": results,
+        "score": 1.0,
+        "model_version": "ground-truth"
+    }

 # check_if_complete label studio images are done
 def check_if_complete(annotations):
@@ -319,7 +322,7 @@ def import_image_tasks(label_studio_project, image_names, local_image_dir, datas

     for j, image_name in enumerate(batch_images):
         print(f"Preparing {image_name} for Label Studio import")
-        data_dict = {'image': os.path.join("/data/local-files/?d=input/", os.path.basename(image_name))}
+        data_dict = {'image': os.path.join("/data/local-files/?d=MillionTrees/input/", os.path.basename(image_name))}

         if predictions is not None:
             prediction = predictions[i + j]
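Note: taken together, these changes mean each imported task now carries score and model_version on the prediction object rather than on individual results, which matches the Label Studio predictions format. Roughly the shape of one imported task (all field values shown are illustrative):

    task = {
        "data": {"image": "/data/local-files/?d=MillionTrees/input/example.png"},
        "predictions": [{
            "model_version": "ground-truth",
            "score": 1.0,  # prediction-level score
            "result": [{
                "to_name": "image",
                "from_name": "label",
                "type": "rectanglelabels",
                "value": {  # percent coordinates, per Label Studio convention
                    "x": 10.0, "y": 20.0, "width": 5.0, "height": 5.0,
                    "rotation": 0,
                    "rectanglelabels": ["tree"],
                },
            }],
        }],
    }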
