weecology
diff --git a/‎.bumpversion.cfg‎
Lines changed: 1 addition & 1 deletion b/‎.bumpversion.cfg‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎.vscode/launch.json‎
Lines changed: 16 additions & 0 deletions b/‎.vscode/launch.json‎
Lines changed: 16 additions & 0 deletions
diff --git a/‎current.png‎
-444 KB b/‎current.png‎
-444 KB
diff --git a/‎data_prep/SelvaBox.py‎
Lines changed: 43 additions & 12 deletions b/‎data_prep/SelvaBox.py‎
Lines changed: 43 additions & 12 deletions
diff --git a/‎data_prep/Troles_bamberg.py‎
Lines changed: 2 additions & 1 deletion b/‎data_prep/Troles_bamberg.py‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎data_prep/annotation_loop.py‎
Lines changed: 7 additions & 5 deletions b/‎data_prep/annotation_loop.py‎
Lines changed: 7 additions & 5 deletions
diff --git a/‎data_prep/download_ofo_unsupervised.py‎
Lines changed: 10 additions & 10 deletions b/‎data_prep/download_ofo_unsupervised.py‎
Lines changed: 10 additions & 10 deletions
diff --git a/‎data_prep/label_studio_utils.py‎
Lines changed: 8 additions & 5 deletions b/‎data_prep/label_studio_utils.py‎
Lines changed: 8 additions & 5 deletions
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.4.0
+current_version = 0.4.2
 commit = True
 tag = True
 parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)
 
@@ -51,6 +51,22 @@
             "cwd": "${workspaceFolder}/data_prep",
             "justMyCode": true,
         },
+        {
+            "name": "Debug test_release.py",
+            "type": "debugpy",
+            "request": "launch",
+            "module": "pytest",
+            "args": [
+                "${workspaceFolder}/tests/test_release.py",
+                "-v"
+            ],
+            "console": "integratedTerminal",
+            "cwd": "${workspaceFolder}",
+            "env": {
+                "PYTHONPATH": "${workspaceFolder}:${env:PYTHONPATH}"
+            },
+            "justMyCode": false
+        }
 
     ]
 }
@@ -5,17 +5,31 @@
 from tqdm import tqdm
 from PIL import Image
 import json
-import ast
 from deepforest.utilities import read_file
 
-def download_selvabox():
-    """Download and process the SelvaBox dataset from HuggingFace"""
+def download_selvabox(force_download=False):
+    """Download and process the SelvaBox dataset from HuggingFace
+    
+    Args:
+        force_download (bool): If True, re-download parquet files even if cached
+    """
 
     # Create output directory (using standard MillionTrees path structure)
     output_dir = "/orange/ewhite/DeepForest/SelvaBox"
     images_dir = os.path.join(output_dir, "images")
+    cache_dir = os.path.join(output_dir, "cache")
+    annotations_csv = os.path.join(output_dir, "annotations.csv")
+    
+    # Check if dataset already exists locally
+    if not force_download and os.path.exists(annotations_csv) and os.path.exists(images_dir):
+        print(f"Dataset already exists at {output_dir}")
+        print(f"Found {len(os.listdir(images_dir))} images and annotations at {annotations_csv}")
+        print("Use force_download=True to re-download the dataset")
+        return annotations_csv
+    
     os.makedirs(output_dir, exist_ok=True)
     os.makedirs(images_dir, exist_ok=True)
+    os.makedirs(cache_dir, exist_ok=True)
 
     print("Downloading SelvaBox dataset from HuggingFace...")
 
@@ -36,10 +50,25 @@ def download_selvabox():
         split = file_info['split'] 
         parquet_url = file_info['url']
 
-        print(f"Processing {split} split from {parquet_url}")
+        # Cache parquet files locally
+        parquet_filename = os.path.basename(parquet_url.split('?')[0])  # Remove query params
+        cached_parquet_path = os.path.join(cache_dir, f"{split}_{parquet_filename}")
+        
+        # Download parquet file if not cached or if force_download is True
+        if force_download or not os.path.exists(cached_parquet_path):
+            print(f"Downloading {split} split parquet file...")
+            parquet_response = requests.get(parquet_url, stream=True)
+            parquet_response.raise_for_status()
+            
+            with open(cached_parquet_path, 'wb') as f:
+                for chunk in parquet_response.iter_content(chunk_size=8192):
+                    f.write(chunk)
+            print(f"Cached {split} split to {cached_parquet_path}")
+        else:
+            print(f"Using cached {split} split from {cached_parquet_path}")
 
-        # Read the parquet file directly from HuggingFace
-        df = pd.read_parquet(parquet_url)
+        # Read from cached file
+        df = pd.read_parquet(cached_parquet_path)
 
         print(f"Loaded {len(df)} rows from {split} split")
 
@@ -60,8 +89,12 @@ def download_selvabox():
 
                     image_path = os.path.join(images_dir, image_filename)
 
+                    # Skip saving if image already exists (unless force_download is True)
+                    if not force_download and os.path.exists(image_path):
+                        # Image exists, skip saving but continue to annotations
+                        pass
                     # Save image from bytes
-                    if isinstance(image_data, dict) and 'bytes' in image_data:
+                    elif isinstance(image_data, dict) and 'bytes' in image_data:
                         try:
                             image_bytes = image_data['bytes']
 
@@ -72,7 +105,6 @@ def download_selvabox():
 
                             # Convert to PNG and verify dimensions
                             with Image.open(temp_tif_path) as img:
-                                img_width, img_height = img.size
                                 # Convert to RGB if necessary
                                 if img.mode != 'RGB':
                                     img = img.convert('RGB')
@@ -159,15 +191,14 @@ def infer_split_from_filename(p: str):
     print(f"Annotation bounds - ymax: [{annotations_df['ymax'].min():.2f}, {annotations_df['ymax'].max():.2f}]")
 
     # Save annotations
-    output_csv = os.path.join(output_dir, "annotations.csv")
-    annotations_df.to_csv(output_csv, index=False)
-    print(f"Annotations saved to {output_csv}")
+    annotations_df.to_csv(annotations_csv, index=False)
+    print(f"Annotations saved to {annotations_csv}")
 
     # Show sample of the data
     print("\nSample annotations:")
     print(annotations_df.head())
 
-    return output_csv
+    return annotations_csv
 
 if __name__ == "__main__":
     download_selvabox()
@@ -64,7 +64,8 @@ def create_shapely_polygons_from_coco_segmentation_json(json_file):
 test2_polygons = create_shapely_polygons_from_coco_segmentation_json(test_set2)
 
 train_polygons["existing_split"] = "train"
-eval_polygons["existing_split"] = "eval"
+# Naming note: the eval split (eval_polygons) is used as the MillionTrees test set, whereas test1 and test2 are the validation sets from the original paper.
+eval_polygons["existing_split"] = "test"
 test1_polygons["existing_split"] = "test1"
 test2_polygons["existing_split"] = "test2"
 
 
@@ -94,7 +94,7 @@ def _build_preannotations(df: pd.DataFrame, dataset_type: str, images_root: Path
         preannotations.append(rows.reset_index(drop=True))
     return images, preannotations
 
-def upload_eval_splits(version: str, base_dir: str = "/orange/ewhite/web/public/MillionTrees/", num_images: int = 100) -> None:
+def upload_eval_splits(version: str, base_dir: str = "/orange/ewhite/web/public/MillionTrees/", num_images: int = 5) -> None:
     """Upload up to `num_images` test images per dataset/split with preannotations to Label Studio.
     
     Projects created: MillionTrees-Eval-<dataset_type>-<split>
@@ -108,11 +108,13 @@ def upload_eval_splits(version: str, base_dir: str = "/orange/ewhite/web/public/
     sftp_client = create_sftp_client(
         user=os.getenv("USER"),
         host=os.getenv("HOST"),
-        key_filename=os.path.expanduser(os.getenv("KEY_FILENAME"))
+        key_filename=os.path.expanduser(os.getenv("KEY"))
     )
 
-    for dataset_type in ("TreeBoxes", "TreePoints"):
-        for split_name in ("random", "zeroshot"):
+    #for dataset_type in ("TreeBoxes", "TreePoints"):
+       # for split_name in ("random", "zeroshot"):
+    for dataset_type in (["TreeBoxes"]):
+        for split_name in (["zeroshot"]):
             df = _load_test_records(base_dir, version, dataset_type, split_name)
             if df.empty:
                 print(f"No test records for {dataset_type} {split_name}, skipping.")
@@ -296,4 +298,4 @@ def main():
 
 if __name__ == "__main__":
     #main() 
-    upload_eval_splits(version="v0.9", base_dir="/orange/ewhite/web/public/MillionTrees/", num_images=100)
+    upload_eval_splits(version="v0.9", base_dir="/orange/ewhite/web/public/MillionTrees/", num_images=200)
@@ -17,7 +17,7 @@ def _read_geopandas(path: str):
         import geopandas as gpd  # type: ignore
     except ImportError as exc:
         raise ImportError(
-            "Optional dependency missing: 'geopandas'. Install with `pip install milliontrees[unsupervised]` or `pip install geopandas`."
+            "Optional dependency missing: 'geopandas'. Install with `pip install milliontrees[weak_supervised]` or `pip install geopandas`."
         ) from exc
     return gpd.read_file(path)
 
@@ -28,7 +28,7 @@ def _open_raster(path: str):
         from rasterio.windows import Window  # noqa: F401
     except ImportError as exc:
         raise ImportError(
-            "Optional dependency missing: 'rasterio'. Install with `pip install milliontrees[unsupervised]` or `pip install rasterio`."
+            "Optional dependency missing: 'rasterio'. Install with `pip install milliontrees[weak_supervised]` or `pip install rasterio`."
         ) from exc
     return __import__('rasterio').open(path)
 
@@ -77,7 +77,7 @@ def _require_requests():
         import requests  # type: ignore
     except ImportError as exc:
         raise ImportError(
-            "Optional dependency missing: 'requests'. Install with `pip install milliontrees[unsupervised]` or `pip install requests`."
+            "Optional dependency missing: 'requests'. Install with `pip install milliontrees[weak_supervised]` or `pip install requests`."
         ) from exc
     return __import__('requests')
 
@@ -236,7 +236,7 @@ def run(
     allow_empty: bool = False,
 ):
     """
-    Build an unsupervised OFO points parquet by tiling orthomosaics and mapping treetops points.
+    Build a weak supervised OFO points parquet by tiling orthomosaics and mapping treetops points.
 
     Args:
         parquet_path: Input parquet file to load annotations and get missions to download
@@ -249,7 +249,7 @@ def run(
 
     images_dir = os.path.join(milliontrees_image_dir, 'images')
     ensure_dir(images_dir)
-    out_dir = os.path.join(milliontrees_image_dir, 'unsupervised')
+    out_dir = os.path.join(milliontrees_image_dir, 'weak_supervised')
     ensure_dir(out_dir)
 
     # Read annotations parquet to get mission IDs
@@ -354,7 +354,7 @@ def run(
                             'filename': out_name,
                             'x': (pts_tile['x'] - x0).astype(int).values,
                             'y': (pts_tile['y'] - y0).astype(int).values,
-                            'source': 'OFO treetops unsupervised',
+                            'source': 'OFO treetops weak supervised',
                             'split': 'train',
                         })
                         records.append(df_tile)
@@ -364,7 +364,7 @@ def run(
                             'filename': [out_name],
                             'x': [np.nan],
                             'y': [np.nan],
-                            'source': ['OFO treetops unsupervised'],
+                            'source': ['OFO treetops weak supervised'],
                             'split': ['train'],
                         }))
         finally:
@@ -377,13 +377,13 @@ def run(
     df = pd.concat(records, ignore_index=True)
     # Remove NaN rows if any
     df = df.dropna(subset=['x', 'y'])
-    out_path = os.path.join(out_dir, "TreePoints_OFO_unsupervised.parquet")
+    out_path = os.path.join(out_dir, "TreePoints_OFO_weak_supervised.parquet")
     df.to_parquet(out_path, index=False)
-    print(f"Wrote OFO unsupervised points to {out_path}")
+    print(f"Wrote OFO weak supervised points to {out_path}")
 
 
 def parse_args():
-    parser = argparse.ArgumentParser(description='Build OFO unsupervised points dataset')
+    parser = argparse.ArgumentParser(description='Build OFO weak supervised points dataset')
     parser.add_argument('--data_dir', required=True, help='MillionTrees dataset directory')
     parser.add_argument('--patch_size', type=int, default=800)
     parser.add_argument('--allow_empty', action='store_true')
 
@@ -90,10 +90,8 @@ def label_studio_format(local_image_dir, preannotations, dataset_type):
                 "value": {
                     "x": float(row['x']/original_width*100),  # Ensure float
                     "y": float(row['y']/original_height*100),  # Ensure float
-                    "width": 1.0,  # Add fixed width for visibility
                     "keypointlabels": ["tree"]  # Use fixed label
                 },
-                "score": 1.0,
                 "to_name": "image",
                 "type": "keypointlabels",
                 "from_name": "label",
@@ -113,7 +111,6 @@ def label_studio_format(local_image_dir, preannotations, dataset_type):
                     "rotation": 0,
                     "rectanglelabels": ["tree"]  # Use fixed label
                 },
-                "score": 1.0,
                 "to_name": "image",
                 "type": "rectanglelabels",
                 "from_name": "label",
@@ -122,7 +119,13 @@ def label_studio_format(local_image_dir, preannotations, dataset_type):
             }
             results.append(result)
 
-    return {"result": results}
+    # Return a prediction object compatible with Label Studio expectations.
+    # Score belongs to the prediction, not individual results.
+    return {
+        "result": results,
+        "score": 1.0,
+        "model_version": "ground-truth"
+    }
 
 # check_if_complete label studio images are done
 def check_if_complete(annotations):
@@ -319,7 +322,7 @@ def import_image_tasks(label_studio_project, image_names, local_image_dir, datas
 
         for j, image_name in enumerate(batch_images):
             print(f"Preparing {image_name} for Label Studio import")
-            data_dict = {'image': os.path.join("/data/local-files/?d=input/", os.path.basename(image_name))}
+            data_dict = {'image': os.path.join("/data/local-files/?d=MillionTrees/input/", os.path.basename(image_name))}
 
             if predictions is not None:
                 prediction = predictions[i + j]