Skip to content

Commit 26c80c2

Browse files
committed
simplifying process, making small fixes
1 parent 546587d commit 26c80c2

File tree

9 files changed

+78
-111
lines changed

9 files changed

+78
-111
lines changed

src/silvimetric/cli/cli.py

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -199,13 +199,6 @@ def info_cmd(
199199
@click.option(
200200
'--resolution', type=float, default=100, help='Summary pixel resolution'
201201
)
202-
@click.option(
203-
'--filter_empty',
204-
is_flag=True,
205-
type=bool,
206-
default=False,
207-
help='Remove empty space in computation. Will take extra time.',
208-
)
209202
@click.option(
210203
'--point_count', type=int, default=600000, help='Point count threshold.'
211204
)
@@ -215,7 +208,7 @@ def info_cmd(
215208
)
216209
@click.pass_obj
217210
def scan_cmd(
218-
app, resolution, point_count, pointcloud, bounds, depth, filter_empty
211+
app, resolution, point_count, pointcloud, bounds, depth
219212
):
220213
"""Scan point cloud, output information on it, and determine the optimal
221214
tile size."""
@@ -233,7 +226,6 @@ def scan_cmd(
233226
point_count,
234227
resolution,
235228
depth,
236-
filter_empty,
237229
log=app.log,
238230
)
239231

src/silvimetric/commands/scan.py

Lines changed: 4 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import dask
55
import math
66
import json
7+
import itertools
78

89
from dask.diagnostics import ProgressBar
910

@@ -17,7 +18,6 @@ def scan(
1718
point_count: int = 600000,
1819
resolution: float = 100,
1920
depth: int = 6,
20-
filter_empty: bool = False,
2121
log: Log = None,
2222
):
2323
"""
@@ -29,7 +29,6 @@ def scan(
2929
:param point_count: Point count threshold., defaults to 600000
3030
:param resolution: Resolution threshold., defaults to 100
3131
:param depth: Tree depth threshold., defaults to 6
32-
:param filter_empty: Remove empty Extents. This takes longer, but is more
3332
accurate., defaults to False
3433
3534
:return: Returns list of point counts.
@@ -54,14 +53,9 @@ def scan(
5453
logger.info('Gathering initial chunks...')
5554
count = dask.delayed(data.estimate_count)(extents.bounds).persist()
5655

57-
if filter_empty:
58-
chunks = extents.chunk(data, point_count)
59-
cell_counts = [ch.cell_count for ch in chunks]
60-
61-
else:
62-
cell_counts = extent_handle(
63-
extents, data, resolution, point_count, depth, log
64-
)
56+
cell_counts = extent_handle(
57+
extents, data, resolution, point_count, depth, log
58+
)
6559

6660
num_cells = np.sum(cell_counts).item()
6761
std = np.std(cell_counts)

src/silvimetric/commands/shatter.py

Lines changed: 51 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -4,20 +4,15 @@
44
import copy
55
from typing_extensions import Generator
66
import pandas as pd
7+
import itertools
78

8-
from dask.distributed import (
9-
as_completed,
10-
futures_of,
11-
CancelledError,
12-
fire_and_forget,
13-
)
9+
from dask.distributed import CancelledError
1410
from distributed.client import _get_global_client as get_client
1511

16-
from dask.delayed import Delayed, delayed
17-
import dask.array as da
12+
from dask.delayed import delayed
1813
import dask.bag as db
1914
from dask.diagnostics import ProgressBar
20-
from dask import persist, compute
15+
from dask import compute
2116

2217
from .. import Extents, Storage, Data, ShatterConfig
2318
from ..resources.taskgraph import Graph
@@ -39,14 +34,15 @@ def get_data(extents: Extents, filename: str, storage: Storage):
3934
data.execute()
4035

4136
points = p.get_dataframe(0)
42-
points = points.loc[points.Y < extents.bounds.maxy]
43-
points = points.loc[points.Y >= extents.bounds.miny]
44-
points = points.loc[points.X >= extents.bounds.minx]
45-
points = points.loc[points.X < extents.bounds.maxx, [*attrs, 'xi', 'yi']]
37+
points = (points
38+
.loc[points.Y < extents.bounds.maxy]
39+
.loc[points.Y >= extents.bounds.miny]
40+
.loc[points.X >= extents.bounds.minx]
41+
.loc[points.X < extents.bounds.maxx, [*attrs, 'xi', 'yi']])
4642

47-
points.loc[:, 'xi'] = da.floor(points.xi)
43+
points.loc[:, 'xi'] = np.floor(points.xi)
4844
# ceil for y because origin is at top left
49-
points.loc[:, 'yi'] = da.ceil(points.yi)
45+
points.loc[:, 'yi'] = np.ceil(points.yi)
5046
return points
5147

5248

@@ -74,10 +70,10 @@ def agg_list(data_in, proc_num):
7470

7571
coerced = data_in.astype(col_dtypes | xyi_dtypes)
7672
gb = coerced.groupby(['xi', 'yi'], sort=False)
77-
listed = gb.agg(lambda x: np.array(x, old_dtypes[x.name]))
7873
counts_df = gb[first_col_name].agg('count').rename('count')
79-
listed = listed.join(counts_df)
80-
listed = listed.assign(shatter_process_num=proc_num)
74+
listed = (gb.agg(lambda x: np.array(x, old_dtypes[x.name]))
75+
.join(counts_df)
76+
.assign(shatter_process_num=proc_num))
8177

8278
return listed
8379

@@ -159,45 +155,49 @@ def kill_gracefully(signum, frame):
159155

160156
signal.signal(signal.SIGINT, kill_gracefully)
161157

162-
# leaf_bag: db.Bag = db.from_sequence(leaves)
163-
# processes = leaf_bag.map(do_one, config, storage)
164-
processes = [delayed(do_one)(leaf, config, storage) for leaf in leaves]
165-
166158
## If dask is distributed, use the futures feature
167159
dc = get_client()
168160
consolidate_count = 30
169-
count = 0
170161
if dc is not None:
171-
pc_futures = futures_of(persist(processes))
172-
for batch in as_completed(pc_futures, with_results=True).batches():
173-
for _, pack in batch:
174-
if isinstance(pack, CancelledError):
175-
continue
176-
if isinstance(pack, int):
177-
pack = [pack]
178-
for pc in pack:
179-
if isinstance(pc, BaseException):
180-
config.log.warning('Worker returned exception: ', pc)
181-
if isinstance(pc, int):
182-
count += 1
183-
if count >= consolidate_count:
184-
faf = dc.submit(
185-
storage.consolidate_shatter,
186-
timestamp=config.timestamp,
187-
)
188-
fire_and_forget(faf)
189-
count = 0
190-
config.point_count = config.point_count + pc
191-
del pc
162+
processes = []
163+
count = 0
164+
for leaf_bunch in itertools.batched(leaves, consolidate_count):
165+
count = count + 1
166+
processes.append(dc.map(do_one, leaf_bunch, config=config, storage=storage))
167+
168+
processes.append(dc.submit(storage.consolidate_shatter, config.timestamp))
169+
gathered = dc.gather(processes)
170+
point_count = 0
171+
for pc in gathered:
172+
if pc is None:
173+
continue
174+
if isinstance(pc, int):
175+
point_count = point_count + pc
176+
elif isinstance(pc, BaseException):
177+
config.log.warning(pc)
178+
elif isinstance(pc, CancelledError):
179+
config.log.warning(pc)
180+
del pc
192181

193-
end_time = datetime.datetime.now().timestamp() * 1000
194-
config.end_time = end_time
195-
config.finished = True
196-
point_count = config.point_count
197182
else:
198183
# Handle non-distributed dask scenarios
199184
with ProgressBar():
200-
point_count = sum(*compute(processes))
185+
count = 0
186+
futures = []
187+
for leaf in leaves:
188+
count = count + 1
189+
futures.append(delayed(do_one)(leaf, config, storage))
190+
if count % consolidate_count == 0:
191+
futures.append(delayed(storage.consolidate_shatter)(timestamp=config.timestamp))
192+
193+
results = compute(*futures)
194+
pcs = [possible_pc for possible_pc in results if possible_pc is not None]
195+
point_count = sum(pcs)
196+
197+
end_time = datetime.datetime.now().timestamp() * 1000
198+
config.end_time = end_time
199+
config.finished = True
200+
config.point_count = point_count
201201

202202
return point_count
203203

@@ -234,9 +234,9 @@ def shatter(config: ShatterConfig) -> int:
234234
if config.tile_size is not None:
235235
leaves = extents.get_leaf_children(config.tile_size)
236236
else:
237-
chunks = extents.chunk(data, pc_threshold=600000)
238-
leaves = db.from_sequence(chunks).compute()
237+
leaves = extents.chunk(data)
239238

239+
leaves = itertools.chain(leaves)
240240
# Begin main operations
241241
config.log.debug('Fetching and arranging data...')
242242
storage.save_shatter_meta(config)

src/silvimetric/resources/extents.py

Lines changed: 3 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -149,7 +149,6 @@ def filter(
149149
:param depth: Current tree depth., defaults to 0
150150
:return: Returns a list of Extents.
151151
"""
152-
153152
pc = data.estimate_count(self.bounds)
154153

155154
target_pc = pc_threshold
@@ -160,7 +159,6 @@ def filter(
160159
yield self
161160
else:
162161
# has it hit the threshold yet?
163-
area = (maxx - minx) * (maxy - miny)
164162
next_split_x = (maxx - minx) / 2
165163
next_split_y = (maxy - miny) / 2
166164

@@ -221,25 +219,8 @@ def split(self):
221219
self.root,
222220
), # top right
223221
]
224-
return exts
225-
222+
yield from exts
226223

227-
def _find_dims(self, tile_size):
228-
"""
229-
Find most square-like Extents given the number of cells per tile.
230-
231-
:param tile_size: Number of cells per tile.
232-
:return: Returns x and y coordinates in a list.
233-
"""
234-
s = math.sqrt(tile_size)
235-
if int(s) == s:
236-
return [s, s]
237-
rng = np.arange(1, tile_size + 1, dtype=np.int32)
238-
factors = rng[np.where(tile_size % rng == 0)]
239-
idx = int((factors.size / 2) - 1)
240-
x = factors[idx]
241-
y = int(tile_size / x)
242-
return [x, y]
243224

244225
def get_leaf_children(self, tile_size):
245226
"""
@@ -249,7 +230,8 @@ def get_leaf_children(self, tile_size):
249230
:yield: Yield from list of child extents.
250231
"""
251232
res = self.resolution
252-
xnum, ynum = self._find_dims(tile_size)
233+
xnum = math.floor(math.sqrt(tile_size))
234+
ynum = xnum
253235

254236
local_xs = np.array(
255237
[

src/silvimetric/resources/metric.py

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,7 @@ def entry_name(self, attr: str) -> str:
148148
"""Name for use in TileDB and extract file generation."""
149149
return f'm_{attr}_{self.name}'
150150

151-
def sanitize_and_run(self, d, locs, args):
151+
def sanitize_and_run(self, d, locs, deps):
152152
"""Sanitize arguments, find the indices"""
153153
# Args are the return values of previous DataFrame aggregations.
154154
# In order to access the correct location, we need a map of groupby
@@ -157,14 +157,14 @@ def sanitize_and_run(self, d, locs, args):
157157
attr = d.name
158158
attrs = [a.entry_name(attr) for a in self.dependencies]
159159

160-
if isinstance(args, pd.DataFrame):
160+
if isinstance(deps, pd.DataFrame):
161161
idx = locs.loc[d.index[0]]
162162
xi = idx.xi
163163
yi = idx.yi
164164
pass_args = []
165165
for a in attrs:
166166
try:
167-
arg = args.at[(yi, xi), a]
167+
arg = deps.at[(yi, xi), a]
168168
if isinstance(arg, (list, tuple)):
169169
pass_args.append(arg)
170170
elif np.isnan(arg):
@@ -178,7 +178,7 @@ def sanitize_and_run(self, d, locs, args):
178178
else:
179179
raise (e)
180180
else:
181-
pass_args = args
181+
pass_args = deps
182182
a = self._method(d, *pass_args)
183183
return a
184184

@@ -224,16 +224,13 @@ def merge(left, right):
224224
def runner(d, idx=idxer, m_args=merged_args):
225225
return self.sanitize_and_run(d, idx, m_args)
226226

227-
# def runner(values, index, m=merged_args):
228-
# return self._method(values, index, merged_args)
229-
230227
# create map of current column name to tuple of new column name and
231228
# metric method
232229
cols = data.columns
233230
prev_cols = [col for col in cols if col not in idx]
234231
new_cols = {c: [(self.entry_name(c), runner)] for c in prev_cols}
235232

236-
val = gb.aggregate(new_cols)
233+
val = gb.aggregate(new_cols, raw=True)
237234

238235
# remove hierarchical columns
239236
val.columns = val.columns.droplevel(0)

src/silvimetric/resources/metrics/l_moments.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from ..metric import Metric
22
import numpy as np
3+
from numba import jit
34

45

56
def lmom4(data, *args):
@@ -8,12 +9,11 @@ def lmom4(data, *args):
89
Adapted from https://xiaoganghe.github.io/python-climate-visuals/chapters/data-analytics/scipy-basic.html
910
"""
1011

11-
data = data.values
12-
n = len(data)
12+
n = data.count()
1313
idx = np.arange(n)
1414

1515
# sort in descending order
16-
data = np.sort(data, kind='quickstort')[::-1]
16+
data = np.sort(data)[::-1]
1717

1818
b0 = data.mean()
1919
l1: float = b0

src/silvimetric/resources/metrics/p_moments.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,11 @@
11
import numpy as np
22

33
from ..metric import Metric
4-
import pdb
54

65
def m_mean(data, *args):
7-
m = data.mean()
8-
if m.size == 0:
6+
if not data.any():
97
return np.nan
8+
m = data.mean()
109
return m
1110

1211

0 commit comments

Comments (0)