44import copy
55from typing_extensions import Generator
66import pandas as pd
7+ import itertools
78
8- from dask .distributed import (
9- as_completed ,
10- futures_of ,
11- CancelledError ,
12- fire_and_forget ,
13- )
9+ from dask .distributed import CancelledError
1410from distributed .client import _get_global_client as get_client
1511
16- from dask .delayed import Delayed , delayed
17- import dask .array as da
12+ from dask .delayed import delayed
1813import dask .bag as db
1914from dask .diagnostics import ProgressBar
20- from dask import persist , compute
15+ from dask import compute
2116
2217from .. import Extents , Storage , Data , ShatterConfig
2318from ..resources .taskgraph import Graph
@@ -39,14 +34,15 @@ def get_data(extents: Extents, filename: str, storage: Storage):
3934 data .execute ()
4035
4136 points = p .get_dataframe (0 )
42- points = points .loc [points .Y < extents .bounds .maxy ]
43- points = points .loc [points .Y >= extents .bounds .miny ]
44- points = points .loc [points .X >= extents .bounds .minx ]
45- points = points .loc [points .X < extents .bounds .maxx , [* attrs , 'xi' , 'yi' ]]
37+ points = (points
38+ .loc [points .Y < extents .bounds .maxy ]
39+ .loc [points .Y >= extents .bounds .miny ]
40+ .loc [points .X >= extents .bounds .minx ]
41+ .loc [points .X < extents .bounds .maxx , [* attrs , 'xi' , 'yi' ]])
4642
47- points .loc [:, 'xi' ] = da .floor (points .xi )
43+ points .loc [:, 'xi' ] = np .floor (points .xi )
4844 # ceil for y because origin is at top left
49- points .loc [:, 'yi' ] = da .ceil (points .yi )
45+ points .loc [:, 'yi' ] = np .ceil (points .yi )
5046 return points
5147
5248
@@ -74,10 +70,10 @@ def agg_list(data_in, proc_num):
7470
7571 coerced = data_in .astype (col_dtypes | xyi_dtypes )
7672 gb = coerced .groupby (['xi' , 'yi' ], sort = False )
77- listed = gb .agg (lambda x : np .array (x , old_dtypes [x .name ]))
7873 counts_df = gb [first_col_name ].agg ('count' ).rename ('count' )
79- listed = listed .join (counts_df )
80- listed = listed .assign (shatter_process_num = proc_num )
74+ listed = (gb .agg (lambda x : np .array (x , old_dtypes [x .name ]))
75+ .join (counts_df )
76+ .assign (shatter_process_num = proc_num ))
8177
8278 return listed
8379
@@ -159,45 +155,49 @@ def kill_gracefully(signum, frame):
159155
160156 signal .signal (signal .SIGINT , kill_gracefully )
161157
162- # leaf_bag: db.Bag = db.from_sequence(leaves)
163- # processes = leaf_bag.map(do_one, config, storage)
164- processes = [delayed (do_one )(leaf , config , storage ) for leaf in leaves ]
165-
166158 ## If dask is distributed, use the futures feature
167159 dc = get_client ()
168160 consolidate_count = 30
169- count = 0
170161 if dc is not None :
171- pc_futures = futures_of (persist (processes ))
172- for batch in as_completed (pc_futures , with_results = True ).batches ():
173- for _ , pack in batch :
174- if isinstance (pack , CancelledError ):
175- continue
176- if isinstance (pack , int ):
177- pack = [pack ]
178- for pc in pack :
179- if isinstance (pc , BaseException ):
180- config .log .warning ('Worker returned exception: ' , pc )
181- if isinstance (pc , int ):
182- count += 1
183- if count >= consolidate_count :
184- faf = dc .submit (
185- storage .consolidate_shatter ,
186- timestamp = config .timestamp ,
187- )
188- fire_and_forget (faf )
189- count = 0
190- config .point_count = config .point_count + pc
191- del pc
162+ processes = []
163+ count = 0
164+ for leaf_bunch in itertools .batched (leaves , consolidate_count ):
165+ count = count + 1
166+ processes .append (dc .map (do_one , leaf_bunch , config = config , storage = storage ))
167+
168+ processes .append (dc .submit (storage .consolidate_shatter , config .timestamp ))
169+ gathered = dc .gather (processes )
170+ point_count = 0
171+ for pc in gathered :
172+ if pc is None :
173+ continue
174+ if isinstance (pc , int ):
175+ point_count = point_count + pc
176+ elif isinstance (pc , BaseException ):
177+ config .log .warning (pc )
178+ elif isinstance (pc , CancelledError ):
179+ config .log .warning (pc )
180+ del pc
192181
193- end_time = datetime .datetime .now ().timestamp () * 1000
194- config .end_time = end_time
195- config .finished = True
196- point_count = config .point_count
197182 else :
198183 # Handle non-distributed dask scenarios
199184 with ProgressBar ():
200- point_count = sum (* compute (processes ))
185+ count = 0
186+ futures = []
187+ for leaf in leaves :
188+ count = count + 1
189+ futures .append (delayed (do_one )(leaf , config , storage ))
190+ if count % consolidate_count == 0 :
191+ futures .append (delayed (storage .consolidate_shatter )(timestamp = config .timestamp ))
192+
193+ results = compute (* futures )
194+ pcs = [possible_pc for possible_pc in results if possible_pc is not None ]
195+ point_count = sum (pcs )
196+
197+ end_time = datetime .datetime .now ().timestamp () * 1000
198+ config .end_time = end_time
199+ config .finished = True
200+ config .point_count = point_count
201201
202202 return point_count
203203
@@ -234,9 +234,9 @@ def shatter(config: ShatterConfig) -> int:
234234 if config .tile_size is not None :
235235 leaves = extents .get_leaf_children (config .tile_size )
236236 else :
237- chunks = extents .chunk (data , pc_threshold = 600000 )
238- leaves = db .from_sequence (chunks ).compute ()
237+ leaves = extents .chunk (data )
239238
239+ leaves = itertools .chain (leaves )
240240 # Begin main operations
241241 config .log .debug ('Fetching and arranging data...' )
242242 storage .save_shatter_meta (config )