55from tqdm import tqdm
66from PIL import Image
77import json
8- import ast
98from deepforest .utilities import read_file
109
11- def download_selvabox ():
12- """Download and process the SelvaBox dataset from HuggingFace"""
10+ def download_selvabox (force_download = False ):
11+ """Download and process the SelvaBox dataset from HuggingFace
12+
13+ Args:
14+ force_download (bool): If True, re-download parquet files even if cached
15+ """
1316
1417 # Create output directory (using standard MillionTrees path structure)
1518 output_dir = "/orange/ewhite/DeepForest/SelvaBox"
1619 images_dir = os .path .join (output_dir , "images" )
20+ cache_dir = os .path .join (output_dir , "cache" )
21+ annotations_csv = os .path .join (output_dir , "annotations.csv" )
22+
23+ # Check if dataset already exists locally
24+ if not force_download and os .path .exists (annotations_csv ) and os .path .exists (images_dir ):
25+ print (f"Dataset already exists at { output_dir } " )
26+ print (f"Found { len (os .listdir (images_dir ))} images and annotations at { annotations_csv } " )
27+ print ("Use force_download=True to re-download the dataset" )
28+ return annotations_csv
29+
1730 os .makedirs (output_dir , exist_ok = True )
1831 os .makedirs (images_dir , exist_ok = True )
32+ os .makedirs (cache_dir , exist_ok = True )
1933
2034 print ("Downloading SelvaBox dataset from HuggingFace..." )
2135
@@ -36,10 +50,25 @@ def download_selvabox():
3650 split = file_info ['split' ]
3751 parquet_url = file_info ['url' ]
3852
39- print (f"Processing { split } split from { parquet_url } " )
53+ # Cache parquet files locally
54+ parquet_filename = os .path .basename (parquet_url .split ('?' )[0 ]) # Remove query params
55+ cached_parquet_path = os .path .join (cache_dir , f"{ split } _{ parquet_filename } " )
56+
57+ # Download parquet file if not cached or if force_download is True
58+ if force_download or not os .path .exists (cached_parquet_path ):
59+ print (f"Downloading { split } split parquet file..." )
60+ parquet_response = requests .get (parquet_url , stream = True )
61+ parquet_response .raise_for_status ()
62+
63+ with open (cached_parquet_path , 'wb' ) as f :
64+ for chunk in parquet_response .iter_content (chunk_size = 8192 ):
65+ f .write (chunk )
66+ print (f"Cached { split } split to { cached_parquet_path } " )
67+ else :
68+ print (f"Using cached { split } split from { cached_parquet_path } " )
4069
41- # Read the parquet file directly from HuggingFace
42- df = pd .read_parquet (parquet_url )
70+ # Read from cached file
71+ df = pd .read_parquet (cached_parquet_path )
4372
4473 print (f"Loaded { len (df )} rows from { split } split" )
4574
@@ -60,8 +89,12 @@ def download_selvabox():
6089
6190 image_path = os .path .join (images_dir , image_filename )
6291
92+ # Skip saving if image already exists (unless force_download is True)
93+ if not force_download and os .path .exists (image_path ):
94+ # Image exists, skip saving but continue to annotations
95+ pass
6396 # Save image from bytes
64- if isinstance (image_data , dict ) and 'bytes' in image_data :
97+ elif isinstance (image_data , dict ) and 'bytes' in image_data :
6598 try :
6699 image_bytes = image_data ['bytes' ]
67100
@@ -72,7 +105,6 @@ def download_selvabox():
72105
73106 # Convert to PNG and verify dimensions
74107 with Image .open (temp_tif_path ) as img :
75- img_width , img_height = img .size
76108 # Convert to RGB if necessary
77109 if img .mode != 'RGB' :
78110 img = img .convert ('RGB' )
@@ -159,15 +191,14 @@ def infer_split_from_filename(p: str):
159191 print (f"Annotation bounds - ymax: [{ annotations_df ['ymax' ].min ():.2f} , { annotations_df ['ymax' ].max ():.2f} ]" )
160192
161193 # Save annotations
162- output_csv = os .path .join (output_dir , "annotations.csv" )
163- annotations_df .to_csv (output_csv , index = False )
164- print (f"Annotations saved to { output_csv } " )
194+ annotations_df .to_csv (annotations_csv , index = False )
195+ print (f"Annotations saved to { annotations_csv } " )
165196
166197 # Show sample of the data
167198 print ("\n Sample annotations:" )
168199 print (annotations_df .head ())
169200
170- return output_csv
201+ return annotations_csv
171202
172203if __name__ == "__main__" :
173204 download_selvabox ()
0 commit comments