
Commit e5386fa

Merge pull request #271 from allenai/favyen/20260121-forest-loss-peru

Forest loss driver: new round of Peru annotation

2 parents c2d47f2 + 9bcd79b

File tree

12 files changed: +426 −161 lines

data/forest_loss_driver/config_studio_annotation.json

Lines changed: 56 additions & 134 deletions
Large diffs are not rendered by default.

rslp/forest_loss_driver/scripts/add_area_to_studio_tasks.py

Lines changed: 8 additions & 6 deletions
```diff
@@ -20,7 +20,7 @@
 from rslearn.utils.geometry import STGeometry
 from rslearn.utils.get_utm_ups_crs import get_utm_ups_projection
 
-BASE_URL = "https://earth-system-studio.allen.ai/api/v1"
+BASE_URL = "https://olmoearth.allenai.org/api/v1"
 
 # Arbitrary user ID to save the annotation under.
 # This one is ES Studio User.
@@ -37,8 +37,10 @@
     # Get the annotation metadata field ID for the Area field.
     url = f"{BASE_URL}/projects/{project_id}"
     response = requests.get(url, headers=headers, timeout=10)
-    assert response.status_code == 200
-    project_data = response.json()
+    response.raise_for_status()
+    json_data = response.json()
+    assert len(json_data["records"]) == 1
+    project_data = json_data["records"][0]
     metadata_field_id = None
     for metadata_field in project_data["template"]["annotation_metadata_fields"]:
         if metadata_field["name"] != "Area":
@@ -50,13 +52,13 @@
     # Now iterate through tasks.
     url = f"{BASE_URL}/projects/{project_id}/tasks?limit=1000"
     response = requests.get(url, headers=headers, timeout=10)
-    assert response.status_code == 200
+    response.raise_for_status()
     item_list = response.json()["items"]
     for task in tqdm.tqdm(item_list):
         task_id = task["id"]
         url = f"{BASE_URL}/tasks/{task_id}/annotations"
         response = requests.get(url, headers=headers, timeout=10)
-        assert response.status_code == 200
+        response.raise_for_status()
         fc = response.json()
         if len(fc["features"]) != 1:
             continue
@@ -106,4 +108,4 @@
 
         url = f"{BASE_URL}/annotations/{annotation_id}"
         response = requests.put(url, json.dumps(post_data), headers=headers, timeout=10)
-        assert response.status_code == 200
+        response.raise_for_status()
```
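For context on this change: `raise_for_status()` raises `requests.exceptions.HTTPError` on 4xx/5xx responses, so failures report the status code and URL instead of a bare `AssertionError`. A minimal sketch against a hypothetical nonexistent project:

```python
import requests

# Hypothetical request against a project that does not exist, to show the failure mode.
response = requests.get(
    "https://olmoearth.allenai.org/api/v1/projects/nonexistent", timeout=10
)
try:
    response.raise_for_status()
except requests.exceptions.HTTPError as err:
    # The exception message includes the status code and the URL.
    print(err)
```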
rslp/forest_loss_driver/scripts/peru_20260112/README.md

Lines changed: 87 additions & 0 deletions
This project populates examples for a new phase of Peru annotation.
## Get Predictions

First we get predictions in Peru for a five-year period. `integrated_config.yaml`
contains the YAML config used for the integrated inference pipeline in
olmoearth_projects:

```
python -m olmoearth_projects.main projects.forest_loss_driver.deploy integrated_pipeline --config ../rslearn_projects/rslp/forest_loss_driver/scripts/peru_20260112/integrated_config.yaml
```
We only need to run it up to the step where it collects the events across the Studio
jobs, which produced this file:

```
/weka/dfive-default/rslearn-eai/datasets/forest_loss_driver/dataset_v1/peru_20260112/inference/dataset_20260109/events_from_studio_jobs.geojson
```
## Select Examples

Then we select examples for annotation:

```
python rslp/forest_loss_driver/scripts/peru_20260112/select_examples_for_annotation.py
```

This script will read the events from the file above and write out an rslearn dataset
here:

```
/weka/dfive-default/rslearn-eai/datasets/forest_loss_driver/dataset_v1/peru_20260112/rslearn_dataset_for_selected_events/
```
The rslearn dataset should first be created with the config file from
`data/forest_loss_driver/config_studio_annotation.json`.
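To sketch that setup step (assuming rslearn's convention that the dataset
configuration lives at `config.json` under the dataset root):

```python
import shutil
from pathlib import Path

# Create the dataset root and install the annotation config as its config.json.
dataset_root = Path(
    "/weka/dfive-default/rslearn-eai/datasets/forest_loss_driver/dataset_v1/"
    "peru_20260112/rslearn_dataset_for_selected_events"
)
dataset_root.mkdir(parents=True, exist_ok=True)
shutil.copy(
    "data/forest_loss_driver/config_studio_annotation.json",
    dataset_root / "config.json",
)
```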
The selection is done by randomly sampling 100 forest loss events that were predicted
as each of logging/burned/none/river/airstrip (500 total), and another 500 where the
maximum probability is <0.5 (indicating the model was not confident).
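In condensed form, the selection criteria look like this (a sketch only; the full
logic, including the minimum-spacing check between selected events, is in
`select_examples_for_annotation.py` included in this commit):

```python
import random

RARE_CATEGORIES = ["logging", "burned", "none", "river", "airstrip"]
PROB_THRESHOLD = 0.5


def select_events(predictions: list) -> list:
    """Sketch of the sampling criteria over the prediction features."""
    by_class: dict[str, list] = {category: [] for category in RARE_CATEGORIES}
    low_confidence: list = []
    for feat in predictions:
        category = feat.properties["new_label"]
        if category in RARE_CATEGORIES:
            by_class[category].append(feat)
        elif max(feat.properties["probs"]) < PROB_THRESHOLD:
            low_confidence.append(feat)

    selected: list = []
    # 100 per rare category (500 total), plus 500 low-confidence events.
    for candidates in by_class.values():
        selected.extend(random.sample(candidates, min(100, len(candidates))))
    selected.extend(random.sample(low_confidence, min(500, len(low_confidence))))
    return selected
```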
## Prepare and Materialize

Make sure to set the PLANET_API_KEY environment variable, since it is used in the
dataset config. Then:

```
rslearn dataset prepare --root /weka/dfive-default/rslearn-eai/datasets/forest_loss_driver/dataset_v1/peru_20260112/rslearn_dataset_for_selected_events/ --workers 128 --retry-max-attempts 10 --retry-backoff-seconds 5
rslearn dataset materialize --root /weka/dfive-default/rslearn-eai/datasets/forest_loss_driver/dataset_v1/peru_20260112/rslearn_dataset_for_selected_events/ --workers 128 --retry-max-attempts 10 --retry-backoff-seconds 5 --ignore-errors
```
## Additional Steps

Afterwards there are a few additional steps we need to do, because we forgot to
include them in the initial example selection script.
First, rename the tasks so they have the format `[#113] 2024-05-13 at -8.9846, -76.7046 prediction:burned`:

```
python rslp/forest_loss_driver/scripts/peru_20260112/rename_tasks.py
```

Then, add the label layer (forest loss polygon):

```
python rslp/forest_loss_driver/scripts/peru_20260112/add_label.py
```
## Sync to Studio

Copy to GCS:

```
gsutil -m rsync -r /weka/dfive-default/rslearn-eai/datasets/forest_loss_driver/dataset_v1/peru_20260112/rslearn_dataset_for_selected_events/ gs://ai2-rslearn-projects-data/datasets/forest_loss_driver/dataset_v1/peru_20260112/rslearn_dataset_for_selected_events/
```

Then make a request to have Studio import the dataset (the project must be created in
Studio first):

```
curl https://olmoearth.allenai.org/api/v1/datasets/ingest --request PUT --header 'Content-Type: application/json' --header "Authorization: Bearer $STUDIO_API_TOKEN" --data '{"dataset_path": "gs://ai2-rslearn-projects-data/datasets/forest_loss_driver/dataset_v1/peru_20260112/rslearn_dataset_for_selected_events/", "project_id": "60e16f40-dbe8-4932-af1b-3f762572530d", "layer_source_names": {}, "prediction_layer_names": []}'
```
After the project is populated, copy the annotation metadata fields from another
project (it should have a Confidence enum with High/Medium/Low and an Area number with
range 0-9999), and use `../add_area_to_studio_tasks.py` to set the area in hectares
for each polygon.
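A minimal sketch for checking that the copied fields are present, reusing the project
endpoint from `add_area_to_studio_tasks.py` (the project ID is the one from the ingest
request above):

```python
import os

import requests

BASE_URL = "https://olmoearth.allenai.org/api/v1"
PROJECT_ID = "60e16f40-dbe8-4932-af1b-3f762572530d"
headers = {"Authorization": f"Bearer {os.environ['STUDIO_API_TOKEN']}"}

url = f"{BASE_URL}/projects/{PROJECT_ID}"
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()
project_data = response.json()["records"][0]
for field in project_data["template"]["annotation_metadata_fields"]:
    # Expect a Confidence enum (High/Medium/Low) and an Area number (0-9999).
    print(field["name"])
```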
On 2026-01-20 we sent the project to ACA, and they are now looking at it; once
annotation is completed, we will need to look into retraining the model.
rslp/forest_loss_driver/scripts/peru_20260112/add_label.py

Lines changed: 77 additions & 0 deletions
```python
"""Add the label polygon since we forgot to include it initially."""

import multiprocessing
from datetime import datetime, timedelta

import tqdm
from rasterio.crs import CRS
from rslearn.dataset import Dataset
from rslearn.utils.feature import Feature
from rslearn.utils.geometry import Projection
from rslearn.utils.grid_index import GridIndex
from rslearn.utils.vector_format import GeojsonCoordinateMode, GeojsonVectorFormat
from upath import UPath

PREDICTION_FNAME = "/weka/dfive-default/rslearn-eai/datasets/forest_loss_driver/dataset_v1/peru_20260112/inference/dataset_20260109/events_from_studio_jobs.geojson"
OUTPUT_DATASET_PATH = "/weka/dfive-default/rslearn-eai/datasets/forest_loss_driver/dataset_v1/peru_20260112/rslearn_dataset_for_selected_events/"
NUM_WORKERS = 128

# Web Mercator projection that all windows are in (about 9.554 m/pixel, i.e.
# zoom level 14).
PROJECTION = Projection(CRS.from_epsg(3857), 9.554628535647032, -9.554628535647032)


def reproject_feature(feat: Feature) -> Feature:
    """Helper function to re-project a feature to the WebMercator projection."""
    return Feature(feat.geometry.to_projection(PROJECTION), feat.properties)


if __name__ == "__main__":
    multiprocessing.set_start_method("forkserver")

    # Load features (predictions) and windows.
    features = GeojsonVectorFormat().decode_from_file(UPath(PREDICTION_FNAME))
    dataset = Dataset(UPath(OUTPUT_DATASET_PATH))
    windows = dataset.load_windows(show_progress=True, workers=NUM_WORKERS)

    # We need to find the feature that corresponds to each window so we can add it as
    # the label layer. So we create a grid index over the features. We use Web Mercator
    # for the grid index since the index needs everything in one projection.
    p = multiprocessing.Pool(NUM_WORKERS)
    reprojected_features = p.imap_unordered(reproject_feature, features)
    grid_index = GridIndex(size=100)
    for feat in tqdm.tqdm(
        reprojected_features, desc="Creating grid index", total=len(features)
    ):
        grid_index.insert(feat.geometry.shp.bounds, feat)
    p.close()

    # Now iterate over windows and find the closest feature.
    # We make sure that the dates line up.
    for window in tqdm.tqdm(windows, desc="Adding labels"):
        candidates: list[Feature] = grid_index.query(window.bounds)
        best_feat = None
        best_distance: float | None = None
        for candidate in candidates:
            candidate_point = candidate.geometry.to_projection(PROJECTION).shp.centroid
            distance = window.get_geometry().shp.centroid.distance(candidate_point)
            if best_distance is None or distance < best_distance:
                best_feat = candidate
                best_distance = distance

        # The rslearn windows were created using select_examples_for_annotation.py
        # based on the centroid of the GeoJSON features, so if there is a large
        # distance then it must mean we matched to the wrong feature.
        if best_feat is None or best_distance is None or best_distance > 10:
            raise ValueError(f"no spatially matching feature for window {window.name}")

        feat_datetime = datetime.fromisoformat(best_feat.properties["oe_start_time"])
        if abs(feat_datetime - window.time_range[0]) > timedelta(days=1):
            raise ValueError(f"no temporally matching feature for window {window.name}")

        layer_dir = window.get_layer_dir("label")
        # Reset the label so it is marked unlabeled.
        best_feat.properties["new_label"] = "unlabeled"
        GeojsonVectorFormat(coordinate_mode=GeojsonCoordinateMode.WGS84).encode_vector(
            layer_dir, [best_feat]
        )
        window.mark_layer_completed("label")
```
rslp/forest_loss_driver/scripts/peru_20260112/integrated_config.yaml

Lines changed: 17 additions & 0 deletions
```yaml
integrated_config:
  weka_base_dir: "/weka/dfive-default/rslearn-eai/datasets/forest_loss_driver/dataset_v1/peru_20260112/inference/"
  gcs_base_dir: "gs://ai2-rslearn-projects-data/forest_loss_driver/dataset_v1/peru_20260112/inference/"
  extract_alerts_args:
    gcs_tiff_filenames:
      - "070W_10S_060W_00N.tif"
      - "070W_20S_060W_10S.tif"
      - "080W_10S_070W_00N.tif"
      - "080W_20S_070W_10S.tif"
    out_fname: "placeholder"
    country_data_path: "/weka/dfive-default/rslearn-eai/artifacts/natural_earth_countries/20240830/ne_10m_admin_0_countries.shp"
    countries: ["PE"]
    # 1825 days = five years.
    days: 1825
    max_number_of_events: 200000
    asset_workers: 128
    make_tiles_workers: 128
    write_individual_events_workers: 128
```
rslp/forest_loss_driver/scripts/peru_20260112/rename_tasks.py

Lines changed: 28 additions & 0 deletions
```python
"""We initially named the tasks differently, so we rename them to a better format."""

import random
import shutil

import tqdm
from rslearn.dataset.dataset import Dataset, Window
from upath import UPath

DATASET_PATH = "/weka/dfive-default/rslearn-eai/datasets/forest_loss_driver/dataset_v1/peru_20260112/rslearn_dataset_for_selected_events/"


if __name__ == "__main__":
    ds_path = UPath(DATASET_PATH)
    dataset = Dataset(ds_path)
    windows = dataset.load_windows()
    random.shuffle(windows)
    for idx, window in enumerate(tqdm.tqdm(windows)):
        src_name = window.name
        # Window names from select_examples_for_annotation.py look like
        # "[#0]_-76.7046_-8.9846_predicted:burned".
        _, lon_str, lat_str, predicted = src_name.split("_")
        # Keep just the category from the "predicted:category" suffix.
        predicted_category = predicted.split(":", 1)[1]
        date_time_str = window.time_range[0].strftime("%Y-%m-%d")
        dst_name = f"[#{idx+1:04d}] {date_time_str} at {float(lat_str):.04f}, {float(lon_str):.04f} prediction:{predicted_category}"
        shutil.move(
            Window.get_window_root(ds_path, window.group, src_name),
            Window.get_window_root(ds_path, window.group, dst_name),
        )
        window.name = dst_name
        window.save()
```
rslp/forest_loss_driver/scripts/peru_20260112/select_examples_for_annotation.py

Lines changed: 130 additions & 0 deletions
```python
"""Select examples for this new Peru annotation.

Based on predictions in Peru over a five-year period:
- Select 100 for each of logging/burned/none/river/airstrip
- Select 500 from other categories where max(probs) < 0.5
"""

import random
from datetime import datetime

from rasterio.crs import CRS
from rslearn.const import WGS84_PROJECTION
from rslearn.dataset import Dataset, Window
from rslearn.utils.feature import Feature
from rslearn.utils.geometry import Projection
from rslearn.utils.grid_index import GridIndex
from rslearn.utils.vector_format import GeojsonVectorFormat
from upath import UPath

PREDICTION_FNAME = "/weka/dfive-default/rslearn-eai/datasets/forest_loss_driver/dataset_v1/peru_20260112/inference/dataset_20260109/events_from_studio_jobs.geojson"
OUTPUT_DATASET_PATH = "/weka/dfive-default/rslearn-eai/datasets/forest_loss_driver/dataset_v1/peru_20260112/rslearn_dataset_for_selected_events/"
TARGET_GROUP = "20260112_peru"
RARE_CATEGORIES = ["logging", "burned", "none", "river", "airstrip"]
PROB_THRESHOLD = 0.5
# About 1000 m expressed in degrees of latitude.
DISTANCE_THRESHOLD = 1000 / 111111
WINDOW_SIZE = 128


if __name__ == "__main__":
    # Load predictions.
    predictions = GeojsonVectorFormat().decode_from_file(UPath(PREDICTION_FNAME))

    # Create candidates for the different selection criteria.
    by_class_options: dict[str, list[Feature]] = {
        category: [] for category in RARE_CATEGORIES
    }
    by_prob_options: list[Feature] = []
    for feat in predictions:
        category = feat.properties["new_label"]
        if category in RARE_CATEGORIES:
            by_class_options[category].append(feat)
        elif max(feat.properties["probs"]) < PROB_THRESHOLD:
            by_prob_options.append(feat)

    for category, candidates in by_class_options.items():
        print(f"got {len(candidates)} options by class for category={category}")
    print(f"got {len(by_prob_options)} options by prob")

    # Select windows; we make sure their center points are at least ~1 km away
    # from each other (see DISTANCE_THRESHOLD above).
    grid_index = GridIndex(size=DISTANCE_THRESHOLD)
    selected: list[Feature] = []

    def contains_bbox(box: tuple[float, float, float, float]) -> bool:
        """Check whether the box contains a point in grid_index."""
        for other in grid_index.query(box):
            if (
                other[0] > box[0]
                and other[1] > box[1]
                and other[0] < box[2]
                and other[1] < box[3]
            ):
                return True
        return False

    def add_random_sample_of_features(features: list[Feature], max_count: int) -> int:
        """Add a random sample of windows from the list to the selected set."""
        # Add up to max_count from the features list.
        random.shuffle(features)
        cur_selected: list[Feature] = []
        for feat in features:
            center_point = feat.geometry.to_projection(WGS84_PROJECTION).shp.centroid
            if contains_bbox(
                (
                    center_point.x - DISTANCE_THRESHOLD,
                    center_point.y - DISTANCE_THRESHOLD,
                    center_point.x + DISTANCE_THRESHOLD,
                    center_point.y + DISTANCE_THRESHOLD,
                )
            ):
                continue

            cur_selected.append(feat)
            grid_index.insert(
                (center_point.x, center_point.y, center_point.x, center_point.y),
                (center_point.x, center_point.y),
            )
            if len(cur_selected) >= max_count:
                break

        selected.extend(cur_selected)
        return len(cur_selected)

    for category, candidates in by_class_options.items():
        count = add_random_sample_of_features(candidates, 100)
        print(f"by class category={category} picked {count}/{len(candidates)} windows")
    count = add_random_sample_of_features(by_prob_options, 500)
    print(f"by prob picked {count}/{len(by_prob_options)} windows")
    print(f"got {len(selected)} total to remap")

    # Create windows in the destination dataset for these features.
    dataset = Dataset(UPath(OUTPUT_DATASET_PATH))
    # Web Mercator at about 9.554 m/pixel (zoom level 14).
    dst_proj = Projection(CRS.from_epsg(3857), 9.554628535647032, -9.554628535647032)
    random.shuffle(selected)
    for idx, feat in enumerate(selected):
        wgs84_geom = feat.geometry.to_projection(WGS84_PROJECTION)
        lon = wgs84_geom.shp.centroid.x
        lat = wgs84_geom.shp.centroid.y
        predicted_category = feat.properties["new_label"]
        window_name = f"[#{idx}]_{lon:.04f}_{lat:.04f}_predicted:{predicted_category}"

        # Get bounds in our WebMercator projection.
        dst_geom = feat.geometry.to_projection(dst_proj)
        dst_bounds = (
            int(dst_geom.shp.centroid.x) - WINDOW_SIZE // 2,
            int(dst_geom.shp.centroid.y) - WINDOW_SIZE // 2,
            int(dst_geom.shp.centroid.x) + WINDOW_SIZE // 2,
            int(dst_geom.shp.centroid.y) + WINDOW_SIZE // 2,
        )

        ts = datetime.fromisoformat(feat.properties["oe_start_time"])
        window = Window(
            storage=dataset.storage,
            group=TARGET_GROUP,
            name=window_name,
            projection=dst_proj,
            bounds=dst_bounds,
            time_range=(ts, ts),
        )
        window.save()
```
