1 change: 1 addition & 0 deletions CHANGELOGS.rst
@@ -4,6 +4,7 @@ Change Logs
0.7.1
+++++

* :pr:`156`, :pr:`157`: add plots and other options to deal with the unpredictable
* :pr:`155`: better aggregation of historical data
* :pr:`151`, :pr:`153`: adds command line ``agg`` and class ``CubeLogsPerformance`` to produce timeseries
* :pr:`152`: add a function to compute fully dynamic shapes given any inputs
Expand Down
27 changes: 26 additions & 1 deletion _unittests/ut_helpers/test_log_helper.py
@@ -204,7 +204,7 @@ def test_enumerate_csv_files(self):
self.assertEqual((3, 11), cube.shape)
self.assertIn("RAWFILENAME", cube.data.columns)

def test_cube_logs_performance(self):
def test_cube_logs_performance1(self):
output = self.get_dump_file("test_cube_logs_performance.xlsx")
filename = os.path.join(os.path.dirname(__file__), "data", "data-agg.zip")
assert list(enumerate_csv_files(filename))
@@ -229,6 +229,31 @@ def test_cube_logs_performance(self):
)
self.assertExists(output)

def test_cube_logs_performance2(self):
output = self.get_dump_file("test_cube_logs_performance.xlsx")
filename = os.path.join(os.path.dirname(__file__), "data", "data-agg.zip")
assert list(enumerate_csv_files(filename))
dfs = [open_dataframe(df) for df in enumerate_csv_files(filename)]
assert dfs, f"{filename!r} empty"
cube = CubeLogsPerformance(dfs, keep_last_date=True)
cube.load()
cube.to_excel(
output,
views=[
"agg-suite",
"disc",
"speedup",
"counts",
"time",
"time_export",
"err",
# "cmd",
"bucket-speedup",
"raw-short",
],
)
self.assertExists(output)

def test_duplicate(self):
df = pandas.DataFrame(
[
20 changes: 18 additions & 2 deletions onnx_diagnostic/_command_lines_parser.py
@@ -637,6 +637,14 @@ def get_parser_agg() -> ArgumentParser:
action=BooleanOptionalAction,
help="Keeps only the most recent experiment for the same of keys.",
)
parser.add_argument(
"--keep-last-date",
default=False,
action=BooleanOptionalAction,
help="Rewrite all dates to the last one to simplifies the analysis, "
"this assume changing the date does not add ambiguity, if any, option "
"--recent should be added.",
)
parser.add_argument(
"--raw",
default=True,
@@ -653,6 +661,12 @@
"multiple values are separated by `,`\n"
"regular expressions are allowed",
)
parser.add_argument(
"--drop-keys",
default="",
help="Drops keys from the given list. Something it is faster "
"to remove one than to select all the remaining ones.",
)
parser.add_argument(
"-w",
"--values",
@@ -679,7 +693,7 @@
)
parser.add_argument(
"--views",
default="agg-suite,disc,speedup,time,time_export,err,cmd,"
default="agg-suite,agg-all,disc,speedup,time,time_export,err,cmd,"
"bucket-speedup,raw-short,counts,peak-gpu",
help="Views to add to the output files.",
)
@@ -719,14 +733,16 @@ def _cmd_agg(argv: List[Any]):
), f"Missing time column {args.time!r} in {c!r}\n{df.head()}\n{sorted(df.columns)}"
dfs.append(df)

drop_keys = set(args.drop_keys.split(","))
cube = CubeLogsPerformance(
dfs,
time=args.time,
keys=[a for a in args.keys.split(",") if a],
keys=[a for a in args.keys.split(",") if a and a not in drop_keys],
values=[a for a in args.values.split(",") if a],
ignored=[a for a in args.ignored.split(",") if a],
recent=args.recent,
formulas={k: k for k in args.formula.split(",")},
keep_last_date=args.keep_last_date,
)
cube.load(verbose=max(args.verbose - 1, 0))
if args.verbose:
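Taken together, the two new options shorten the pipeline: ``--drop-keys`` prunes the key list before the cube is built, and ``--keep-last-date`` collapses every run onto the most recent date. A minimal sketch of the equivalent programmatic call, assuming ``dfs`` is a list of ``pandas.DataFrame`` loaded as in the unit tests above (the key names are illustrative, not the parser defaults):

    from onnx_diagnostic.helpers.log_helper import CubeLogsPerformance

    keys = ["model_name", "suite", "opt_patterns"]  # illustrative key names
    drop_keys = {"opt_patterns"}                    # mirrors --drop-keys opt_patterns
    cube = CubeLogsPerformance(
        dfs,                                        # list of pandas.DataFrame
        keys=[k for k in keys if k and k not in drop_keys],
        keep_last_date=True,                        # mirrors --keep-last-date
    )
    cube.load()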
85 changes: 68 additions & 17 deletions onnx_diagnostic/helpers/log_helper.py
@@ -372,7 +372,7 @@ def to_images(
if merge:
nn = len(df.columns) // 2
nn += nn % 2
fig, axs = plt.subplots(nn, 2, figsize=(12, 3 * nn))
fig, axs = plt.subplots(nn, 2, figsize=(12, 3 * nn * df.shape[0] / 12))
pos = 0
for c in loop:
ax = axs[pos // 2, pos % 2]
@@ -455,6 +455,7 @@ def __init__(
]
] = None,
fill_missing: Optional[Sequence[Tuple[str, Any]]] = None,
keep_last_date: bool = False,
):
self._data = data
self._time = time
@@ -464,6 +465,7 @@
self.recent = recent
self._formulas = formulas
self.fill_missing = fill_missing
self.keep_last_date = keep_last_date

def post_load_process_piece(
self, df: pandas.DataFrame, unique: bool = False
@@ -613,6 +615,16 @@ def load(self, verbose: int = 0):
if self.keys_with_nans:
print(f"[CubeLogs.load] keys_with_nans={self.keys_with_nans}")
self.data[self.time] = pandas.to_datetime(self.data[self.time])

if self.keep_last_date:
times = self.data[self.time].dropna()
mi, mx = times.min(), times.max()
if mi != mx:
print(f"[CubeLogs.load] setting all dates in column {self.time} to {mx!r}")
self.data.loc[~self.data[self.time].isna(), self.time] = mx
self.values_for_key[self.time] = {mx}
if self.data[self.time].isna().max():
self.values_for_key[self.time].add(np.nan)
if verbose:
print(f"[CubeLogs.load] done, shape={self.shape}")
return self
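The date normalization above boils down to one masked assignment: every non-NaN timestamp is rewritten to the column maximum while NaNs are preserved, which is why the NaN case is added back to ``values_for_key``. A standalone sketch of the same operation, with an illustrative column name ``date``:

    import pandas

    df = pandas.DataFrame({"date": ["2024-01-01", "2024-02-01", None]})
    df["date"] = pandas.to_datetime(df["date"])
    mx = df["date"].dropna().max()
    # rewrite every non-NaN date to the most recent one; NaT stays NaT
    df.loc[~df["date"].isna(), "date"] = mx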
@@ -821,11 +833,6 @@
unique = set()

_md = lambda s: {k: v for k, v in self.values_for_key.items() if k in s} # noqa: E731
assert key_index, (
f"view_def.name={view_def.name!r}, "
f"key_index should not be empty, got initially {key_index0!r}, "
f"unique={_md(key_index0)}"
)
all_cols = set(key_columns) | set(key_index) | set(key_agg) | unique
assert all_cols == set(self.keys_time), (
f"view_def.name={view_def.name!r}, "
@@ -870,12 +877,6 @@
key_columns = [c for c in key_columns if c not in seti]
values = [c for c in values if c not in seti]

assert key_index, (
f"view_def.name={view_def.name!r}, view_def={view_def}, "
f"key_index is empty, key_columns={key_columns}, value={values}, "
f"columns={data.columns},shape={data.shape}"
)

# final verification
if verbose:
print(f"[CubeLogs.view] key_index={key_index}")
@@ -896,7 +897,14 @@
# pivot
if verbose:
print(f"[CubeLogs.view] values={values}")
piv = data.pivot(index=key_index[::-1], columns=key_columns, values=values)
if key_index:
piv = data.pivot(index=key_index[::-1], columns=key_columns, values=values)
else:
# pivot does not return the same rank when the index is empty.
# Let's artificially add one.
data = data.copy()
data["ALL"] = "ALL"
piv = data.pivot(index=["ALL"], columns=key_columns, values=values)
if isinstance(piv, pandas.Series):
piv = piv.to_frame(name="series")
names = list(piv.columns.names)
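The constant-column fallback is easy to check in isolation: once every key has been moved out of the index, pivoting on a single ``"ALL"`` column keeps the result two-dimensional with one row. A standalone sketch (column names are illustrative):

    import pandas

    data = pandas.DataFrame({"metric": ["a", "b"], "value": [1.0, 2.0]})
    data["ALL"] = "ALL"  # artificial index so the pivot keeps its rank
    piv = data.pivot(index=["ALL"], columns=["metric"], values=["value"])
    print(piv)  # one row labelled ALL, one column per metric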
@@ -1106,7 +1114,7 @@
if memory > 2**22:
msg = (
f"[CubeLogs.to_excel] skipping {name!r}, "
f"too big for excel {memory} bytes"
f"too big for excel with {memory} bytes"
)
if verbose:
print(msg)
@@ -1123,13 +1131,26 @@
plots.append(CubePlot(df, kind="barh", orientation="row", split=True))
if raw:
assert main not in views, f"{main!r} is duplicated in views {sorted(views)}"
if verbose:
print(f"[CubeLogs.to_excel] add sheet {raw!r} with shape {self.shape}")
self.data.to_excel(writer, sheet_name=raw, freeze_panes=(1, 1), index=True)
# Too long.
# self._apply_excel_style(raw, writer, self.data)
if csv and "raw" in csv:
df.reset_index(drop=False).to_csv(f"{output}.raw.csv", index=False)
memory = df.memory_usage(deep=True).sum()
if memory > 2**22:
msg = (
f"[CubeLogs.to_excel] skipping 'raw', "
f"too big for excel with {memory} bytes"
)
if verbose:
print(msg)
else:
warnings.warn(msg, category=RuntimeWarning, stacklevel=0)
else:
if verbose:
print(f"[CubeLogs.to_excel] add sheet 'raw' with shape {self.shape}")
self.data.to_excel(
writer, sheet_name="raw", freeze_panes=(1, 1), index=True
)

if plots:
from openpyxl.drawing.image import Image
@@ -1236,6 +1257,7 @@ def __init__(
"time_export_unbiased",
),
fill_missing: Optional[Sequence[Tuple[str, Any]]] = (("model_attn_impl", "eager"),),
keep_last_date: bool = False,
):
super().__init__(
data=data,
@@ -1246,6 +1268,7 @@
recent=recent,
formulas=formulas,
fill_missing=fill_missing,
keep_last_date=keep_last_date,
)

def _process_formula(
@@ -1577,6 +1600,34 @@ def mean_geo(gr):
keep_columns_in_index=["suite"],
name="agg-suite",
order=order,
),
"agg-all": lambda: CubeViewDef(
key_index=index_cols,
values=self._filter_column(
[
"TIME_ITER",
"speedup",
"time_latency",
"time_latency_eager",
"time_export_success",
"time_export_unbiased",
"^n_.*",
"target_opset",
"onnx_filesize",
"onnx_weight_size_torch",
"onnx_weight_size_proto",
"onnx_n_nodes",
"peak_gpu_torch",
"peak_gpu_nvidia",
],
self.values,
),
ignore_unique=True,
key_agg=["model_name", "task", "model_task", "suite"],
agg_args=lambda column_name: "sum" if column_name.startswith("n_") else "mean",
agg_multi={"speedup_weighted": mean_weight, "speedup_geo": mean_geo},
name="agg-all",
order=order,
plots=True,
),
"disc": lambda: CubeViewDef(
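The new ``agg-all`` view follows the same pattern as ``agg-suite`` but aggregates over ``model_name``, ``task``, ``model_task`` and ``suite`` at once, summing the ``n_*`` counters and averaging the other metrics. A sketch mirroring the unit test above (``dfs`` and the output path are illustrative):

    from onnx_diagnostic.helpers.log_helper import CubeLogsPerformance

    # dfs: a list of pandas.DataFrame built as in the unit tests
    cube = CubeLogsPerformance(dfs, keep_last_date=True)
    cube.load()
    # "agg-all" is now part of the default --views list and can be requested directly
    cube.to_excel("report.xlsx", views=["agg-suite", "agg-all", "speedup"])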