55import fsspec
66import numpy as np
77import pandas as pd
8+ import zarrita
89
910from .parsing import extract_event_type , extract_frame_data , extract_request_data
1011
1112pd .options .plotting .backend = 'holoviews'
13+ pd .options .mode .chained_assignment = None
1214
1315
1416def base64_to_img (base64jpeg ):
@@ -30,7 +32,7 @@ def base64_to_img(base64jpeg):
3032 return cv .imdecode (arr , cv .IMREAD_COLOR )
3133
3234
33- def calculate_snapshot_rmse (* , trace_events , snapshots , metadata ):
35+ def calculate_snapshot_rmse (* , trace_events , snapshots , metadata , xstart : int = 133 ):
3436 """
3537 Extract screenshots from a list of Chromium trace events.
3638
@@ -57,7 +59,7 @@ def calculate_rmse(predictions, targets):
5759 var = f'rmse_snapshot_{ zoom_level } '
5860 for ind , row in screenshots .iterrows ():
5961 frame = base64_to_img (row ['args.snapshot' ])
60- screenshots .loc [ind , var ] = calculate_rmse (frame , snapshot )
62+ screenshots .loc [ind , var ] = calculate_rmse (frame [:, xstart :], snapshot [:, xstart :] )
6163 return screenshots
6264
6365
@@ -118,6 +120,28 @@ def load_data(*, metadata_path: str, run: int):
118120 metadata ['full_trace_path' ] = trace_path
119121 with fs .open (trace_path ) as f :
120122 trace_events = json .loads (f .read ())['traceEvents' ]
# Keep only the trace events the benchmark analysis cares about; dropping the
# rest keeps the in-memory trace small.
# BUG FIX: the original list was missing commas between the six
# 'benchmark-zoom_in-level-*' literals, so Python's implicit string
# concatenation fused them into a single unusable string and none of the
# individual names appeared in event_types (only the 'benchmark-zoom'
# substring fallback below rescued them).
event_types = [
    'ResourceSendRequest',
    'ResourceFinish',
    'BeginFrame',
    'DrawFrame',
    'DroppedFrame',
    'Commit',
    'Screenshot',
    'benchmark-initial-load:start',
    'benchmark-initial-load:end',
    'benchmark-zoom_in-level-0:start',
    'benchmark-zoom_in-level-1:start',
    'benchmark-zoom_in-level-2:start',
    'benchmark-zoom_in-level-0:end',
    'benchmark-zoom_in-level-1:end',
    'benchmark-zoom_in-level-2:end',
]
trace_events = [
    event
    for event in trace_events
    if event['name'] in event_types or 'benchmark-zoom' in event['name']
]
121145 return metadata , trace_events
122146
123147
@@ -141,16 +165,60 @@ def load_snapshots(*, snapshot_path: str):
141165 return snapshots
142166
143167
def get_chunk_size(URI, zarr_version, sharded, var='tasmax'):
    """
    Return the chunk size in MB of ``var`` at zoom level 0.

    Parameters
    ----------
    URI : str
        Remote location of the pyramid store.
    zarr_version : int
        ``2`` opens the array as Zarr v2; any other value as Zarr v3.
    sharded : truthy
        For v3 stores, whether the sharding codec is in use (ignored for v2).
    var : str, optional
        Name of the array to inspect, by default ``'tasmax'``.
    """
    store = zarrita.RemoteStore(URI)
    if zarr_version == 2:
        source = zarrita.ArrayV2.open(store / '0' / var)
        chunk_shape = source.metadata.chunks
    else:
        source = zarrita.Array.open(store / '0' / var)
        if sharded:
            # Sharded v3 arrays carry the inner chunk shape on the codec config.
            chunk_shape = source.metadata.codecs[0].configuration.chunk_shape
        else:
            chunk_shape = source.metadata.chunk_grid.configuration.chunk_shape
    # elements per chunk * bytes per element, converted to (decimal) megabytes
    return np.prod(chunk_shape) * source.metadata.dtype.itemsize * 1e-6
186+
187+
def add_chunk_size(
    summary: pd.DataFrame,
    *,
    root_path: str = 's3://carbonplan-benchmarks/data/NEX-GDDP-CMIP6/ACCESS-CM2/historical/r1i1p1f1/tasmax/tasmax_day_ACCESS-CM2_historical_r1i1p1f1_gn',
):
    """
    Add a column to the summary DataFrame containing the chunk size.

    Looks up the actual chunk size once per unique dataset (via
    ``get_chunk_size``) and joins it back onto ``summary`` on ``dataset``,
    which becomes the returned frame's index.
    """
    # One remote lookup per distinct dataset, not per summary row.
    unique = summary[
        ['zarr_version', 'dataset', 'shard_size', 'target_chunk_size']
    ].drop_duplicates()
    unique['URI'] = root_path + '/' + unique['dataset']
    unique['actual_chunk_size'] = unique.apply(
        lambda row: get_chunk_size(row['URI'], row['zarr_version'], row['shard_size']),
        axis=1,
    )
    lookup = unique[['dataset', 'actual_chunk_size']].set_index('dataset')
    return summary.set_index('dataset').join(lookup)
205+
206+
144207def create_summary (* , metadata : pd .DataFrame , data : dict , url_filter : str = None ):
145208 """
146209 Create summary DataFrame for a given run
147210 """
148- metadata
149211 summary = pd .concat (
150212 [pd .DataFrame (metadata , index = [0 ])] * (metadata ['zoom_level' ] + 1 ), ignore_index = True
151213 )
214+ summary ['metadata_path' ] = metadata ['metadata_path' ]
215+ summary ['trace_path' ] = metadata ['trace_path' ]
152216 summary ['zarr_version' ] = summary ['dataset' ].apply (lambda x : int (x .split ('-' )[1 ][1 ]))
153- summary ['chunk_size' ] = summary ['dataset' ].apply (lambda x : int (x .split ('-' )[5 ]))
217+ summary ['projection' ] = summary ['dataset' ].apply (lambda x : int (x .split ('-' )[2 ]))
218+ summary ['pixels_per_tile' ] = summary ['dataset' ].apply (lambda x : int (x .split ('-' )[4 ]))
219+ summary ['target_chunk_size' ] = summary ['dataset' ].apply (lambda x : int (x .split ('-' )[5 ]))
220+ summary ['shard_orientation' ] = summary ['dataset' ].apply (lambda x : x .split ('-' )[6 ])
221+ summary ['shard_size' ] = summary ['dataset' ].apply (lambda x : int (x .split ('-' )[7 ]))
154222 frames_data = data ['frames_data' ]
155223 request_data = data ['request_data' ]
156224
@@ -164,16 +232,30 @@ def create_summary(*, metadata: pd.DataFrame, data: dict, url_filter: str = None
164232 (request_data ['request_start' ] > actions .loc [zoom , 'start_time' ])
165233 & (request_data ['request_start' ] <= actions .loc [zoom , 'action_end_time' ])
166234 ]
167- summary [ 'total_requests' ] = len (requests )
235+ summary . loc [ zoom , 'total_requests' ] = len (requests )
168236 if url_filter :
169237 requests = requests [requests ['url' ].str .contains (url_filter )]
170- summary ['filtered_requests' ] = len (requests )
171- if requests ['request_start' ].max () > actions .loc [zoom , 'action_end_time' ]:
172- raise Warning (f'Request for zoom level { zoom } started after timeout' )
173- if requests ['response_end' ].max () > actions .loc [zoom , 'action_end_time' ]:
174- raise Warning (f'Response duration for zoom level { zoom } exceeded timeout' )
238+ summary .loc [zoom , 'filtered_requests' ] = len (requests )
239+ summary .loc [zoom , 'filtered_requests_average_encoded_data_length' ] = requests [
240+ 'encoded_data_length'
241+ ].mean ()
242+ summary .loc [zoom , 'filtered_requests_maximum_encoded_data_length' ] = requests [
243+ 'encoded_data_length'
244+ ].max ()
175245 summary .loc [zoom , 'zoom' ] = zoom
176246 summary .loc [zoom , 'duration' ] = actions .loc [zoom , 'duration' ]
247+ summary .loc [zoom , 'timeout' ] = False
248+ if requests ['request_start' ].max () > actions .loc [zoom , 'action_end_time' ]:
249+ actions .loc [zoom , 'action_end_time' ] = np .nan
250+ summary .loc [zoom , 'duration' ] = metadata ['timeout' ]
251+ summary .loc [zoom , 'timeout' ] = True
252+ if requests ['response_end' ].max () > actions .loc [zoom , 'action_end_time' ]:
253+ actions .loc [zoom , 'action_end_time' ] = np .nan
254+ summary .loc [zoom , 'duration' ] = metadata ['timeout' ]
255+ summary .loc [zoom , 'timeout' ] = True
256+ if summary .loc [zoom , 'duration' ] > metadata ['timeout' ]:
257+ summary .loc [zoom , 'duration' ] = metadata ['timeout' ]
258+ summary .loc [zoom , 'timeout' ] = True
177259 summary .loc [zoom , 'fps' ] = len (frames ) / (actions .loc [zoom , 'duration' ] * 1e-3 )
178260 if requests .empty :
179261 summary .loc [zoom , 'request_duration' ] = 0
@@ -183,6 +265,7 @@ def create_summary(*, metadata: pd.DataFrame, data: dict, url_filter: str = None
183265 )
184266 summary ['request_percent' ] = summary ['request_duration' ] / summary ['duration' ] * 100
185267 summary ['non_request_duration' ] = summary ['duration' ] - summary ['request_duration' ]
268+ summary = add_chunk_size (summary )
186269
187270 return summary
188271
0 commit comments