Skip to content

Commit b210ce4

Browse files
committed
fixed scaling of masks
1 parent ba9e120 commit b210ce4

File tree

3 files changed

+96
-76
lines changed

3 files changed

+96
-76
lines changed
Lines changed: 38 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -1,42 +1,43 @@
11
transcripts:
2-
filename: "transcripts.parquet"
3-
x: "global_x"
4-
y: "global_y"
5-
z: "global_z"
6-
id: "transcript_id"
7-
label: "gene"
8-
nuclear_column: "CellComp"
9-
nuclear_value: "Nuclear"
10-
filter_substrings:
11-
- "NegPrb_"
12-
- "SystemControl"
13-
- "Negative"
14-
xy:
15-
- "global_x"
16-
- "global_y"
17-
xyz:
18-
- "global_x"
19-
- "global_y"
20-
- "global_z"
21-
columns:
22-
- "global_x"
23-
- "global_y"
24-
- "global_z"
25-
- "gene"
26-
- "cell_id"
27-
- "CellComp"
28-
- "transcript_id"
2+
filename: "detected_transcripts.parquet"
3+
x: "global_x"
4+
y: "global_y"
5+
z: "global_z"
6+
id: "transcript_id"
7+
label: "gene"
8+
nuclear_column: "overlaps_nucleus"
9+
nuclear_value: 1
10+
filter_substrings:
11+
- "Blank-"
12+
- "BLANK"
13+
xy:
14+
- "global_x"
15+
- "global_y"
16+
xyz:
17+
- "global_x"
18+
- "global_y"
19+
- "global_z"
20+
columns:
21+
- "global_x"
22+
- "global_y"
23+
- "global_z"
24+
- "gene"
25+
- "cell_id"
26+
- "overlaps_nucleus"
27+
- "transcript_id"
2928

3029
boundaries:
31-
filename: "nucleus_boundaries.parquet"
32-
x: "global_x"
33-
y: "global_y"
34-
id: "cell"
35-
label: "cell"
30+
filename: "cellpose_nucleus_micron_space.parquet"
31+
geometry: "Geometry"
32+
id: "EntityID"
33+
label: "EntityID"
34+
x: "centroid_x"
35+
y: "centroid_y"
3636
xy:
37-
- "x_global_px"
38-
- "y_global_px"
37+
- "centroid_x"
38+
- "centroid_y"
3939
columns:
40-
- "x_global_px"
41-
- "y_global_px"
42-
- "cell"
40+
- "Geometry"
41+
- "EntityID"
42+
- "centroid_x"
43+
- "centroid_y"

src/segger/data/parquet/_utils.py

Lines changed: 27 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import pandas as pd
22
import geopandas as gpd
33
import shapely
4+
from shapely.affinity import scale
45
from pyarrow import parquet as pq
56
import numpy as np
67
import scipy as sp
@@ -127,11 +128,22 @@ def read_parquet_region(
127128

128129
columns = list({x, y} | set(extra_columns))
129130

130-
region = pd.read_parquet(
131-
filepath,
132-
filters=filters,
133-
columns=columns,
134-
)
131+
# Check if 'Geometry', 'geometry', 'polygon', or 'Polygon' is in the columns
132+
if any(col in columns for col in ['Geometry', 'geometry', 'polygon', 'Polygon']):
133+
import geopandas as gpd
134+
# If geometry columns are present, read with geopandas
135+
region = gpd.read_parquet(
136+
filepath,
137+
filters=filters,
138+
columns=columns,
139+
)
140+
else:
141+
# Otherwise, read with pandas
142+
region = pd.read_parquet(
143+
filepath,
144+
filters=filters,
145+
columns=columns,
146+
)
135147
return region
136148

137149

@@ -140,7 +152,7 @@ def get_polygons_from_xy(
140152
x: str,
141153
y: str,
142154
label: str,
143-
buffer_ratio: float = 1.0,
155+
scale_factor: float = 1.0,
144156
) -> gpd.GeoSeries:
145157
"""
146158
Convert boundary coordinates from a DataFrame to a GeoSeries of polygons.
@@ -156,8 +168,8 @@ def get_polygons_from_xy(
156168
The name of the column representing the y-coordinate.
157169
label : str
158170
The name of the column representing the cell or nucleus label.
159-
buffer_ratio : float, optional
160-
A ratio to expand or shrink the polygons. A value of 1.0 means no change,
171+
scale_factor : float, optional
172+
A ratio to scale the polygons. A value of 1.0 means no change,
161173
greater than 1.0 expands the polygons, and less than 1.0 shrinks the polygons
162174
(default is 1.0).
163175
@@ -181,19 +193,18 @@ def get_polygons_from_xy(
181193
)
182194
gs = gpd.GeoSeries(polygons, index=np.unique(ids))
183195

184-
if buffer_ratio != 1.0:
185-
# Calculate buffer distance based on polygon area
186-
areas = gs.area
187-
# Use the square root of the area to get a linear distance
188-
buffer_distances = np.sqrt(areas / np.pi) * (buffer_ratio - 1.0)
189-
# Apply buffer to each polygon with its specific distance
196+
# print(gs)
197+
198+
if scale_factor != 1.0:
199+
# Scale polygons around their centroid
190200
gs = gpd.GeoSeries(
191201
[
192-
geom.buffer(dist) if dist != 0 else geom
193-
for geom, dist in zip(gs, buffer_distances)
202+
scale(geom, xfact=scale_factor, yfact=scale_factor, origin='centroid')
203+
for geom in gs
194204
],
195205
index=gs.index,
196206
)
207+
# print(gs)
197208

198209
return gs
199210

src/segger/data/parquet/sample.py

Lines changed: 31 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ def __init__(
3535
self,
3636
base_dir: os.PathLike,
3737
n_workers: Optional[int] = 1,
38-
buffer_ratio: Optional[float] = 1.0,
38+
scale_factor: Optional[float] = 1.0,
3939
sample_type: str = None,
4040
weights: pd.DataFrame = None,
4141
):
@@ -52,8 +52,8 @@ def __init__(
5252
The sample type of the raw data, e.g., 'xenium' or 'merscope'.
5353
weights : Optional[pd.DataFrame], default None
5454
DataFrame containing weights for transcript embedding.
55-
buffer_ratio : Optional[float], default None
56-
The buffer ratio to be used for expanding the boundary extents
55+
scale_factor : Optional[float], default 1.0
56+
The scale factor to be used for expanding the boundary extents
5757
during spatial queries. If not provided, the default from settings
5858
will be used.
5959
@@ -71,15 +71,15 @@ def __init__(
7171
boundaries_fn = self.settings.boundaries.filename
7272
self._boundaries_filepath = self._base_dir / boundaries_fn
7373
self.n_workers = n_workers
74-
self.settings.boundaries.buffer_ratio = 1
74+
self.settings.boundaries.scale_factor = 1
7575
nuclear_column = getattr(self.settings.transcripts, "nuclear_column", None)
76-
if nuclear_column is None or self.settings.boundaries.buffer_ratio != 1.0:
76+
if nuclear_column is None or self.settings.boundaries.scale_factor != 1.0:
7777
print(
7878
"Boundary-transcript overlap information has not been pre-computed. It will be calculated during tile generation."
7979
)
80-
# Set buffer ratio if provided
81-
if buffer_ratio != 1.0:
82-
self.settings.boundaries.buffer_ratio = buffer_ratio
80+
# Set scale factor if provided
81+
if scale_factor != 1.0:
82+
self.settings.boundaries.scale_factor = scale_factor
8383

8484
# Ensure transcript IDs exist
8585
utils.ensure_transcript_ids(
@@ -1164,13 +1164,12 @@ def get_boundary_props(
11641164
of the code.
11651165
"""
11661166
# Get polygons from coordinates
1167-
polygons = utils.get_polygons_from_xy(
1168-
self.boundaries,
1169-
x=self.settings.boundaries.x,
1170-
y=self.settings.boundaries.y,
1171-
label=self.settings.boundaries.label,
1172-
buffer_ratio=self.settings.boundaries.buffer_ratio,
1173-
)
1167+
# Use getattr to check for the geometry column
1168+
geometry_column = getattr(self.settings.boundaries, 'geometry', None)
1169+
if geometry_column and geometry_column in self.boundaries.columns:
1170+
polygons = self.boundaries[geometry_column]
1171+
else:
1172+
polygons = self.boundaries['geometry'] # Fall back to the default 'geometry' column when no geometry column is configured
11741173
# Geometric properties of polygons
11751174
props = self.get_polygon_props(polygons)
11761175
props = torch.as_tensor(props.values).float()
@@ -1230,13 +1229,22 @@ def to_pyg_dataset(
12301229
pyg_data["tx", "neighbors", "tx"].edge_index = nbrs_edge_idx
12311230

12321231
# Set up Boundary nodes
1233-
polygons = utils.get_polygons_from_xy(
1234-
self.boundaries,
1235-
self.settings.boundaries.x,
1236-
self.settings.boundaries.y,
1237-
self.settings.boundaries.label,
1238-
self.settings.boundaries.buffer_ratio,
1239-
)
1232+
# Check if boundaries have geometries
1233+
geometry_column = getattr(self.settings.boundaries, 'geometry', None)
1234+
if geometry_column and geometry_column in self.boundaries.columns:
1235+
polygons = gpd.GeoSeries(self.boundaries[geometry_column], index=self.boundaries.index)
1236+
else:
1237+
# Fallback: compute polygons
1238+
polygons = utils.get_polygons_from_xy(
1239+
self.boundaries,
1240+
x=self.settings.boundaries.x,
1241+
y=self.settings.boundaries.y,
1242+
label=self.settings.boundaries.label,
1243+
scale_factor=self.settings.boundaries.scale_factor,
1244+
)
1245+
1246+
# Ensure self.boundaries is a GeoDataFrame with correct geometry
1247+
self.boundaries = gpd.GeoDataFrame(self.boundaries.copy(), geometry=polygons)
12401248
centroids = polygons.centroid.get_coordinates()
12411249
pyg_data["bd"].id = polygons.index.to_numpy()
12421250
pyg_data["bd"].pos = torch.tensor(centroids.values, dtype=torch.float32)
@@ -1273,7 +1281,7 @@ def to_pyg_dataset(
12731281
nuclear_column = getattr(self.settings.transcripts, "nuclear_column", None)
12741282
nuclear_value = getattr(self.settings.transcripts, "nuclear_value", None)
12751283

1276-
if nuclear_column is None or self.settings.boundaries.buffer_ratio != 1.0:
1284+
if nuclear_column is None or self.settings.boundaries.scale_factor != 1.0:
12771285
is_nuclear = utils.compute_nuclear_transcripts(
12781286
polygons=polygons,
12791287
transcripts=self.transcripts,

0 commit comments

Comments
 (0)