
Commit 895edb5

Update summary dataframe and data list (#41)
1 parent ee5aa1d commit 895edb5

6 files changed: +221 -752 lines changed

README.md

Lines changed: 1 addition & 1 deletion

@@ -30,7 +30,7 @@ options:
   --timeout TIMEOUT     Timeout limit in milliseconds
   --detect-provider     Detect provider
   --approach APPROACH   Approach to use. Must be one of: ['dynamic-client']
-  --dataset DATASET     dataset name. Must be one of: ['pyramids-v3-sharded-4326-1MB', 'pyramids-v3-sharded-4326-5MB']
+  --dataset DATASET     dataset name.
   --variable VARIABLE   Zarr version. Must be one of: ['tasmax']
   --non-headless        Run in non-headless mode
   --s3-bucket S3_BUCKET

binder/environment.yml

Lines changed: 1 addition & 0 deletions

@@ -22,3 +22,4 @@ dependencies:
   - cloud-detect
   - opencv-python-headless
   - git+https://github.com/carbonplan/benchmark-maps
+  - git+https://github.com/scalableminds/zarrita.git@b88ae5ca02917e24e716855f15ffaf020b95bb1a

carbonplan_benchmarks/analysis/processing.py

Lines changed: 93 additions & 10 deletions
@@ -5,10 +5,12 @@
 import fsspec
 import numpy as np
 import pandas as pd
+import zarrita

 from .parsing import extract_event_type, extract_frame_data, extract_request_data

 pd.options.plotting.backend = 'holoviews'
+pd.options.mode.chained_assignment = None


 def base64_to_img(base64jpeg):
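
Note on the added pandas option: pd.options.mode.chained_assignment = None globally silences pandas' SettingWithCopyWarning, which .loc assignments on sliced frames (such as the screenshots slice further down in this module) can otherwise emit. A minimal standalone sketch of the behavior being suppressed; the toy frame and column names are illustrative only, and whether the warning actually fires also depends on the pandas version:

import pandas as pd

pd.options.mode.chained_assignment = None  # silence SettingWithCopyWarning globally

df = pd.DataFrame({'a': [1, 2, 3], 'b': [4.0, 5.0, 6.0]})
subset = df[df['a'] > 1]   # slice whose copy/view status is ambiguous
subset.loc[:, 'c'] = 0.0   # would typically warn without the option above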
@@ -30,7 +32,7 @@ def base64_to_img(base64jpeg):
     return cv.imdecode(arr, cv.IMREAD_COLOR)


-def calculate_snapshot_rmse(*, trace_events, snapshots, metadata):
+def calculate_snapshot_rmse(*, trace_events, snapshots, metadata, xstart: int = 133):
     """
     Extract screenshots from a list of Chromium trace events.

@@ -57,7 +59,7 @@ def calculate_rmse(predictions, targets):
         var = f'rmse_snapshot_{zoom_level}'
         for ind, row in screenshots.iterrows():
             frame = base64_to_img(row['args.snapshot'])
-            screenshots.loc[ind, var] = calculate_rmse(frame, snapshot)
+            screenshots.loc[ind, var] = calculate_rmse(frame[:, xstart:], snapshot[:, xstart:])
     return screenshots
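
The new xstart parameter crops the leftmost columns from both the captured frame and the reference snapshot before the RMSE is computed, presumably so that fixed page elements along the left edge do not skew the comparison. A standalone sketch of that cropped-RMSE idea; the calculate_rmse body and the toy image arrays are assumptions, only the xstart default of 133 comes from the diff:

import numpy as np

def calculate_rmse(predictions, targets):
    # root-mean-square pixel difference between two equally shaped images
    return np.sqrt(np.mean((predictions.astype(float) - targets.astype(float)) ** 2))

xstart = 133  # number of left-edge pixel columns to ignore
rng = np.random.default_rng(0)
frame = rng.integers(0, 255, (500, 800, 3))     # toy captured frame
snapshot = rng.integers(0, 255, (500, 800, 3))  # toy reference snapshot

rmse = calculate_rmse(frame[:, xstart:], snapshot[:, xstart:])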

@@ -118,6 +120,28 @@ def load_data(*, metadata_path: str, run: int):
     metadata['full_trace_path'] = trace_path
     with fs.open(trace_path) as f:
         trace_events = json.loads(f.read())['traceEvents']
+    event_types = [
+        'ResourceSendRequest',
+        'ResourceFinish',
+        'BeginFrame',
+        'DrawFrame',
+        'DroppedFrame',
+        'Commit',
+        'Screenshot',
+        'benchmark-initial-load:start',
+        'benchmark-initial-load:end',
+        'benchmark-zoom_in-level-0:start',
+        'benchmark-zoom_in-level-1:start',
+        'benchmark-zoom_in-level-2:start',
+        'benchmark-zoom_in-level-0:end',
+        'benchmark-zoom_in-level-1:end',
+        'benchmark-zoom_in-level-2:end',
+    ]
+    trace_events = [
+        event
+        for event in trace_events
+        if event['name'] in event_types or 'benchmark-zoom' in event['name']
+    ]
     return metadata, trace_events
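
load_data now keeps only the trace events the analysis consumes: resource, frame, screenshot, and benchmark markers, with a substring fallback for any benchmark-zoom event. A toy sketch of the same filtering pattern; the event dictionaries below are made up for illustration:

trace_events = [
    {'name': 'ResourceSendRequest', 'ts': 1},
    {'name': 'UnrelatedTraceEvent', 'ts': 2},             # dropped by the filter
    {'name': 'benchmark-zoom_in-level-0:start', 'ts': 3},
    {'name': 'Screenshot', 'ts': 4},
]

event_types = ['ResourceSendRequest', 'ResourceFinish', 'Screenshot']

kept = [
    event
    for event in trace_events
    if event['name'] in event_types or 'benchmark-zoom' in event['name']
]
# kept -> ResourceSendRequest, benchmark-zoom_in-level-0:start, Screenshot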

@@ -141,16 +165,60 @@ def load_snapshots(*, snapshot_path: str):
     return snapshots


+def get_chunk_size(URI, zarr_version, sharded, var='tasmax'):
+    """
+    Get chunk size based on zoom level 0.
+    """
+    source_store = zarrita.RemoteStore(URI)
+    if zarr_version == 2:
+        source_array = zarrita.ArrayV2.open(source_store / '0' / var)
+        chunks = source_array.metadata.chunks
+        itemsize = source_array.metadata.dtype.itemsize
+    else:
+        source_array = zarrita.Array.open(source_store / '0' / var)
+        if sharded:
+            chunks = source_array.metadata.codecs[0].configuration.chunk_shape
+        else:
+            chunks = source_array.metadata.chunk_grid.configuration.chunk_shape
+        itemsize = source_array.metadata.dtype.itemsize
+    chunk_size = np.prod(chunks) * itemsize * 1e-6
+    return chunk_size
+
+
+def add_chunk_size(
+    summary: pd.DataFrame,
+    *,
+    root_path: str = 's3://carbonplan-benchmarks/data/NEX-GDDP-CMIP6/ACCESS-CM2/historical/r1i1p1f1/tasmax/tasmax_day_ACCESS-CM2_historical_r1i1p1f1_gn',
+):
+    """
+    Add a column to the summary DataFrame containing the chunk size.
+    """
+    datasets = summary[
+        ['zarr_version', 'dataset', 'shard_size', 'target_chunk_size']
+    ].drop_duplicates()
+    datasets['URI'] = root_path + '/' + datasets['dataset']
+    datasets['actual_chunk_size'] = datasets.apply(
+        lambda x: get_chunk_size(x['URI'], x['zarr_version'], x['shard_size']), axis=1
+    )
+    datasets = datasets[['dataset', 'actual_chunk_size']]
+    return summary.set_index('dataset').join(datasets.set_index('dataset'))
+
+
 def create_summary(*, metadata: pd.DataFrame, data: dict, url_filter: str = None):
     """
     Create summary DataFrame for a given run
     """
-    metadata
     summary = pd.concat(
         [pd.DataFrame(metadata, index=[0])] * (metadata['zoom_level'] + 1), ignore_index=True
     )
+    summary['metadata_path'] = metadata['metadata_path']
+    summary['trace_path'] = metadata['trace_path']
     summary['zarr_version'] = summary['dataset'].apply(lambda x: int(x.split('-')[1][1]))
-    summary['chunk_size'] = summary['dataset'].apply(lambda x: int(x.split('-')[5]))
+    summary['projection'] = summary['dataset'].apply(lambda x: int(x.split('-')[2]))
+    summary['pixels_per_tile'] = summary['dataset'].apply(lambda x: int(x.split('-')[4]))
+    summary['target_chunk_size'] = summary['dataset'].apply(lambda x: int(x.split('-')[5]))
+    summary['shard_orientation'] = summary['dataset'].apply(lambda x: x.split('-')[6])
+    summary['shard_size'] = summary['dataset'].apply(lambda x: int(x.split('-')[7]))
     frames_data = data['frames_data']
     request_data = data['request_data']
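
The new summary columns are read positionally out of the hyphen-separated dataset name, and get_chunk_size converts a chunk shape plus dtype itemsize into megabytes via np.prod(chunks) * itemsize * 1e-6. A sketch of both steps; the dataset name and chunk shape below are hypothetical stand-ins chosen to match the split('-') indices used above, not actual benchmark datasets:

import numpy as np

# Hypothetical dataset name laid out so that the indices used above line up:
#   [1] zarr version, [2] projection, [4] pixels per tile,
#   [5] target chunk size, [6] shard orientation, [7] shard size
dataset = 'pyramids-v3-4326-1-128-5-spatial-10'
parts = dataset.split('-')

zarr_version = int(parts[1][1])    # 3
projection = int(parts[2])         # 4326
pixels_per_tile = int(parts[4])    # 128
target_chunk_size = int(parts[5])  # 5
shard_orientation = parts[6]       # 'spatial'
shard_size = int(parts[7])         # 10

# Chunk size in MB, as in get_chunk_size: product of the chunk shape times the
# dtype itemsize, scaled from bytes to megabytes.
chunks = (365, 128, 128)           # illustrative chunk shape
itemsize = np.dtype('float32').itemsize
chunk_size_mb = np.prod(chunks) * itemsize * 1e-6   # ~23.9 MB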

@@ -164,16 +232,30 @@ def create_summary(*, metadata: pd.DataFrame, data: dict, url_filter: str = None
             (request_data['request_start'] > actions.loc[zoom, 'start_time'])
             & (request_data['request_start'] <= actions.loc[zoom, 'action_end_time'])
         ]
-        summary['total_requests'] = len(requests)
+        summary.loc[zoom, 'total_requests'] = len(requests)
         if url_filter:
             requests = requests[requests['url'].str.contains(url_filter)]
-        summary['filtered_requests'] = len(requests)
-        if requests['request_start'].max() > actions.loc[zoom, 'action_end_time']:
-            raise Warning(f'Request for zoom level {zoom} started after timeout')
-        if requests['response_end'].max() > actions.loc[zoom, 'action_end_time']:
-            raise Warning(f'Response duration for zoom level {zoom} exceeded timeout')
+        summary.loc[zoom, 'filtered_requests'] = len(requests)
+        summary.loc[zoom, 'filtered_requests_average_encoded_data_length'] = requests[
+            'encoded_data_length'
+        ].mean()
+        summary.loc[zoom, 'filtered_requests_maximum_encoded_data_length'] = requests[
+            'encoded_data_length'
+        ].max()
         summary.loc[zoom, 'zoom'] = zoom
         summary.loc[zoom, 'duration'] = actions.loc[zoom, 'duration']
+        summary.loc[zoom, 'timeout'] = False
+        if requests['request_start'].max() > actions.loc[zoom, 'action_end_time']:
+            actions.loc[zoom, 'action_end_time'] = np.nan
+            summary.loc[zoom, 'duration'] = metadata['timeout']
+            summary.loc[zoom, 'timeout'] = True
+        if requests['response_end'].max() > actions.loc[zoom, 'action_end_time']:
+            actions.loc[zoom, 'action_end_time'] = np.nan
+            summary.loc[zoom, 'duration'] = metadata['timeout']
+            summary.loc[zoom, 'timeout'] = True
+        if summary.loc[zoom, 'duration'] > metadata['timeout']:
+            summary.loc[zoom, 'duration'] = metadata['timeout']
+            summary.loc[zoom, 'timeout'] = True
         summary.loc[zoom, 'fps'] = len(frames) / (actions.loc[zoom, 'duration'] * 1e-3)
         if requests.empty:
             summary.loc[zoom, 'request_duration'] = 0
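
Instead of raising a Warning, the rewritten loop flags a zoom level as timed out whenever a request starts or a response finishes after the recorded action end time, or whenever the measured duration exceeds the configured limit, and in each case clamps the duration to metadata['timeout']. A small standalone sketch of that clamping rule; the helper name and the toy millisecond values are illustrative:

def clamp_duration(duration, last_request_start, last_response_end, action_end_time, timeout):
    # Mirror of the timeout handling in create_summary: overruns clamp the
    # duration to the timeout and mark the zoom level as timed out.
    timed_out = False
    if last_request_start > action_end_time or last_response_end > action_end_time:
        duration, timed_out = timeout, True
    if duration > timeout:
        duration, timed_out = timeout, True
    return duration, timed_out

print(clamp_duration(8_500, 7_000, 9_200, 9_000, 10_000))  # (10000, True)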
@@ -183,6 +265,7 @@ def create_summary(*, metadata: pd.DataFrame, data: dict, url_filter: str = None
         )
     summary['request_percent'] = summary['request_duration'] / summary['duration'] * 100
     summary['non_request_duration'] = summary['duration'] - summary['request_duration']
+    summary = add_chunk_size(summary)

     return summary
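
add_chunk_size computes the actual chunk size once per distinct dataset and then joins it back onto every summary row via set_index/join on the dataset name. A toy sketch of that pattern with hypothetical dataset names and sizes:

import pandas as pd

summary = pd.DataFrame({'dataset': ['ds-a', 'ds-a', 'ds-b'], 'zoom': [0, 1, 0]})

# one row per dataset with its (pre-computed) chunk size in MB
datasets = pd.DataFrame({'dataset': ['ds-a', 'ds-b'], 'actual_chunk_size': [5.2, 23.9]})

joined = summary.set_index('dataset').join(datasets.set_index('dataset'))
# every summary row now carries its dataset's actual_chunk_size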
