diff --git a/CHANGELOGS.rst b/CHANGELOGS.rst
index e36fec19..ac81fbb7 100644
--- a/CHANGELOGS.rst
+++ b/CHANGELOGS.rst
@@ -4,6 +4,7 @@ Change Logs
 0.7.1
 +++++
 
+* :pr:`156`, :pr:`157`: add plots and other options to deal with the unpredictable
 * :pr:`155`: better aggregation of historical data
 * :pr:`151`, :pr:`153`: adds command line ``agg``, class CubeLogsPerformance to produce timeseries
 * :pr:`152`: add a function to compute fully dynamic shapes given any inputs
diff --git a/_unittests/ut_helpers/test_log_helper.py b/_unittests/ut_helpers/test_log_helper.py
index cd8594c0..e3c1a6de 100644
--- a/_unittests/ut_helpers/test_log_helper.py
+++ b/_unittests/ut_helpers/test_log_helper.py
@@ -204,7 +204,7 @@ def test_enumerate_csv_files(self):
         self.assertEqual((3, 11), cube.shape)
         self.assertIn("RAWFILENAME", cube.data.columns)
 
-    def test_cube_logs_performance(self):
+    def test_cube_logs_performance1(self):
         output = self.get_dump_file("test_cube_logs_performance.xlsx")
         filename = os.path.join(os.path.dirname(__file__), "data", "data-agg.zip")
         assert list(enumerate_csv_files(filename))
@@ -229,6 +229,31 @@ def test_cube_logs_performance(self):
         )
         self.assertExists(output)
 
+    def test_cube_logs_performance2(self):
+        output = self.get_dump_file("test_cube_logs_performance2.xlsx")
+        filename = os.path.join(os.path.dirname(__file__), "data", "data-agg.zip")
+        assert list(enumerate_csv_files(filename))
+        dfs = [open_dataframe(df) for df in enumerate_csv_files(filename)]
+        assert dfs, f"{filename!r} empty"
+        cube = CubeLogsPerformance(dfs, keep_last_date=True)
+        cube.load()
+        cube.to_excel(
+            output,
+            views=[
+                "agg-suite",
+                "disc",
+                "speedup",
+                "counts",
+                "time",
+                "time_export",
+                "err",
+                # "cmd",
+                "bucket-speedup",
+                "raw-short",
+            ],
+        )
+        self.assertExists(output)
+
     def test_duplicate(self):
         df = pandas.DataFrame(
             [
diff --git a/onnx_diagnostic/_command_lines_parser.py b/onnx_diagnostic/_command_lines_parser.py
index 806068d5..793fd4b8 100644
--- a/onnx_diagnostic/_command_lines_parser.py
+++ b/onnx_diagnostic/_command_lines_parser.py
@@ -637,6 +637,14 @@ def get_parser_agg() -> ArgumentParser:
         action=BooleanOptionalAction,
         help="Keeps only the most recent experiment for the same set of keys.",
     )
+    parser.add_argument(
+        "--keep-last-date",
+        default=False,
+        action=BooleanOptionalAction,
+        help="Rewrites all dates to the last one to simplify the analysis. "
+        "This assumes changing the date does not add ambiguity; if it does, "
+        "option --recent should be added.",
+    )
     parser.add_argument(
         "--raw",
         default=True,
@@ -653,6 +661,12 @@
         "multiple values are separated by `,`\n"
         "regular expressions are allowed",
     )
+    parser.add_argument(
+        "--drop-keys",
+        default="",
+        help="Drops keys from the given list. Sometimes it is faster "
+        "to remove one than to select all the remaining ones.",
+    )
     parser.add_argument(
         "-w",
         "--values",
@@ -679,7 +693,7 @@
     )
     parser.add_argument(
         "--views",
-        default="agg-suite,disc,speedup,time,time_export,err,cmd,"
+        default="agg-suite,agg-all,disc,speedup,time,time_export,err,cmd,"
         "bucket-speedup,raw-short,counts,peak-gpu",
         help="Views to add to the output files.",
     )
@@ -719,14 +733,16 @@ def _cmd_agg(argv: List[Any]):
         ), f"Missing time column {args.time!r} in {c!r}\n{df.head()}\n{sorted(df.columns)}"
         dfs.append(df)
 
+    drop_keys = set(args.drop_keys.split(","))
     cube = CubeLogsPerformance(
         dfs,
         time=args.time,
-        keys=[a for a in args.keys.split(",") if a],
+        keys=[a for a in args.keys.split(",") if a and a not in drop_keys],
         values=[a for a in args.values.split(",") if a],
         ignored=[a for a in args.ignored.split(",") if a],
         recent=args.recent,
         formulas={k: k for k in args.formula.split(",")},
+        keep_last_date=args.keep_last_date,
     )
     cube.load(verbose=max(args.verbose - 1, 0))
     if args.verbose:
diff --git a/onnx_diagnostic/helpers/log_helper.py b/onnx_diagnostic/helpers/log_helper.py
index 83ee77ac..cd786b23 100644
--- a/onnx_diagnostic/helpers/log_helper.py
+++ b/onnx_diagnostic/helpers/log_helper.py
@@ -372,7 +372,7 @@ def to_images(
         if merge:
             nn = len(df.columns) // 2
             nn += nn % 2
-            fig, axs = plt.subplots(nn, 2, figsize=(12, 3 * nn))
+            fig, axs = plt.subplots(nn, 2, figsize=(12, 3 * nn * df.shape[0] / 12))
             pos = 0
             for c in loop:
                 ax = axs[pos // 2, pos % 2]
@@ -455,6 +455,7 @@ def __init__(
             ]
         ] = None,
         fill_missing: Optional[Sequence[Tuple[str, Any]]] = None,
+        keep_last_date: bool = False,
     ):
         self._data = data
         self._time = time
@@ -464,6 +465,7 @@
         self.recent = recent
         self._formulas = formulas
         self.fill_missing = fill_missing
+        self.keep_last_date = keep_last_date
 
     def post_load_process_piece(
         self, df: pandas.DataFrame, unique: bool = False
@@ -613,6 +615,16 @@ def load(self, verbose: int = 0):
             if self.keys_with_nans:
                 print(f"[CubeLogs.load] keys_with_nans={self.keys_with_nans}")
         self.data[self.time] = pandas.to_datetime(self.data[self.time])
+
+        if self.keep_last_date:
+            times = self.data[self.time].dropna()
+            mi, mx = times.min(), times.max()
+            if mi != mx:
+                print(f"[CubeLogs.load] setting all dates in column {self.time} to {mx!r}")
+                self.data.loc[~self.data[self.time].isna(), self.time] = mx
+            self.values_for_key[self.time] = {mx}
+            if self.data[self.time].isna().max():
+                self.values_for_key[self.time].add(np.nan)
         if verbose:
             print(f"[CubeLogs.load] done, shape={self.shape}")
         return self
@@ -821,11 +833,6 @@ def view(
             unique = set()
 
         _md = lambda s: {k: v for k, v in self.values_for_key.items() if k in s}  # noqa: E731
-        assert key_index, (
-            f"view_def.name={view_def.name!r}, "
-            f"key_index should not be empty, got initially {key_index0!r}, "
-            f"unique={_md(key_index0)}"
-        )
         all_cols = set(key_columns) | set(key_index) | set(key_agg) | unique
         assert all_cols == set(self.keys_time), (
             f"view_def.name={view_def.name!r}, "
@@ -870,12 +877,6 @@ def view(
             key_columns = [c for c in key_columns if c not in seti]
             values = [c for c in values if c not in seti]
 
-        assert key_index, (
-            f"view_def.name={view_def.name!r}, view_def={view_def}, "
-            f"key_index is empty, key_columns={key_columns}, value={values}, "
-            f"columns={data.columns},shape={data.shape}"
-        )
-
         # final verification
         if verbose:
             print(f"[CubeLogs.view] key_index={key_index}")
@@ -896,7 +897,14 @@ def view(
         # pivot
         if verbose:
            print(f"[CubeLogs.view] values={values}")
-        piv = data.pivot(index=key_index[::-1], columns=key_columns, values=values)
+        if key_index:
+            piv = data.pivot(index=key_index[::-1], columns=key_columns, values=values)
+        else:
+            # pivot does not return the same rank when the index is empty.
+            # Let's artificially add one.
+            data = data.copy()
+            data["ALL"] = "ALL"
+            piv = data.pivot(index=["ALL"], columns=key_columns, values=values)
         if isinstance(piv, pandas.Series):
             piv = piv.to_frame(name="series")
         names = list(piv.columns.names)
@@ -1106,7 +1114,7 @@ def to_excel(
                 if memory > 2**22:
                     msg = (
                         f"[CubeLogs.to_excel] skipping {name!r}, "
-                        f"too big for excel {memory} bytes"
+                        f"too big for excel with {memory} bytes"
                     )
                     if verbose:
                         print(msg)
@@ -1123,13 +1131,26 @@ def to_excel(
                 plots.append(CubePlot(df, kind="barh", orientation="row", split=True))
         if raw:
             assert main not in views, f"{main!r} is duplicated in views {sorted(views)}"
-            if verbose:
-                print(f"[CubeLogs.to_excel] add sheet {raw!r} with shape {self.shape}")
-            self.data.to_excel(writer, sheet_name=raw, freeze_panes=(1, 1), index=True)
             # Too long.
             # self._apply_excel_style(raw, writer, self.data)
             if csv and "raw" in csv:
                 df.reset_index(drop=False).to_csv(f"{output}.raw.csv", index=False)
+            memory = df.memory_usage(deep=True).sum()
+            if memory > 2**22:
+                msg = (
+                    f"[CubeLogs.to_excel] skipping 'raw', "
+                    f"too big for excel with {memory} bytes"
+                )
+                if verbose:
+                    print(msg)
+                else:
+                    warnings.warn(msg, category=RuntimeWarning, stacklevel=0)
+            else:
+                if verbose:
+                    print(f"[CubeLogs.to_excel] add sheet 'raw' with shape {self.shape}")
+                self.data.to_excel(
+                    writer, sheet_name="raw", freeze_panes=(1, 1), index=True
+                )
 
         if plots:
             from openpyxl.drawing.image import Image
@@ -1236,6 +1257,7 @@ def __init__(
             "time_export_unbiased",
         ),
         fill_missing: Optional[Sequence[Tuple[str, Any]]] = (("model_attn_impl", "eager"),),
+        keep_last_date: bool = False,
     ):
         super().__init__(
             data=data,
@@ -1246,6 +1268,7 @@
             recent=recent,
             formulas=formulas,
             fill_missing=fill_missing,
+            keep_last_date=keep_last_date,
         )
 
     def _process_formula(
@@ -1577,6 +1600,34 @@ def mean_geo(gr):
                 keep_columns_in_index=["suite"],
                 name="agg-suite",
                 order=order,
+            ),
+            "agg-all": lambda: CubeViewDef(
+                key_index=index_cols,
+                values=self._filter_column(
+                    [
+                        "TIME_ITER",
+                        "speedup",
+                        "time_latency",
+                        "time_latency_eager",
+                        "time_export_success",
+                        "time_export_unbiased",
+                        "^n_.*",
+                        "target_opset",
+                        "onnx_filesize",
+                        "onnx_weight_size_torch",
+                        "onnx_weight_size_proto",
+                        "onnx_n_nodes",
+                        "peak_gpu_torch",
+                        "peak_gpu_nvidia",
+                    ],
+                    self.values,
+                ),
+                ignore_unique=True,
+                key_agg=["model_name", "task", "model_task", "suite"],
+                agg_args=lambda column_name: "sum" if column_name.startswith("n_") else "mean",
+                agg_multi={"speedup_weighted": mean_weight, "speedup_geo": mean_geo},
+                name="agg-all",
+                order=order,
                 plots=True,
             ),
             "disc": lambda: CubeViewDef(
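
Usage note: the new ``keep_last_date`` option is exposed both on the ``CubeLogsPerformance`` constructor and as ``--keep-last-date`` on the ``agg`` command line, next to the new ``--drop-keys`` option. Below is a minimal sketch of the Python API, not part of the diff, assuming ``enumerate_csv_files`` and ``open_dataframe`` are importable from ``onnx_diagnostic.helpers.log_helper`` as the unit test above suggests; the file names are hypothetical.

```python
# Minimal sketch (assumptions flagged below, not part of the diff).
# Assumes enumerate_csv_files and open_dataframe live next to
# CubeLogsPerformance in onnx_diagnostic.helpers.log_helper;
# "data-agg.zip" and "report.xlsx" are hypothetical file names.
from onnx_diagnostic.helpers.log_helper import (
    CubeLogsPerformance,
    enumerate_csv_files,
    open_dataframe,
)

# Load every CSV found in the archive into a list of dataframes.
dfs = [open_dataframe(f) for f in enumerate_csv_files("data-agg.zip")]

# keep_last_date=True rewrites all dates onto the most recent one,
# which simplifies the analysis when dates carry no extra information.
cube = CubeLogsPerformance(dfs, keep_last_date=True)
cube.load()

# "agg-all" is the aggregated view introduced by this PR.
cube.to_excel("report.xlsx", views=["agg-all", "agg-suite", "speedup", "raw-short"])
```

From the command line, ``--drop-keys`` removes entries from the default key list, which is sometimes shorter than re-specifying all the remaining keys with ``--keys``.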