Skip to content

Commit e1c18c8

Browse files
committed
Merge branch 'main' of https://github.com/sdpython/onnx-diagnostic into gem
2 parents 1480fdf + 205b288 commit e1c18c8

File tree

4 files changed

+113
-20
lines changed

4 files changed

+113
-20
lines changed

CHANGELOGS.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ Change Logs
44
0.7.1
55
+++++
66

7+
* :pr:`156`, :pr:`157`: add plots and other options to deal with the unpredictable
78
* :pr:`155`: better aggregation of historical data
89
* :pr:`151`, :pr:`153`: adds command line ``agg``, class CubeLogsPerformance to produce timeseries
910
* :pr:`152`: add a function to compute fully dynamic shapes given any inputs

_unittests/ut_helpers/test_log_helper.py

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -204,7 +204,7 @@ def test_enumerate_csv_files(self):
204204
self.assertEqual((3, 11), cube.shape)
205205
self.assertIn("RAWFILENAME", cube.data.columns)
206206

207-
def test_cube_logs_performance(self):
207+
def test_cube_logs_performance1(self):
208208
output = self.get_dump_file("test_cube_logs_performance.xlsx")
209209
filename = os.path.join(os.path.dirname(__file__), "data", "data-agg.zip")
210210
assert list(enumerate_csv_files(filename))
@@ -229,6 +229,31 @@ def test_cube_logs_performance(self):
229229
)
230230
self.assertExists(output)
231231

232+
def test_cube_logs_performance2(self):
233+
output = self.get_dump_file("test_cube_logs_performance.xlsx")
234+
filename = os.path.join(os.path.dirname(__file__), "data", "data-agg.zip")
235+
assert list(enumerate_csv_files(filename))
236+
dfs = [open_dataframe(df) for df in enumerate_csv_files(filename)]
237+
assert dfs, f"{filename!r} empty"
238+
cube = CubeLogsPerformance(dfs, keep_last_date=True)
239+
cube.load()
240+
cube.to_excel(
241+
output,
242+
views=[
243+
"agg-suite",
244+
"disc",
245+
"speedup",
246+
"counts",
247+
"time",
248+
"time_export",
249+
"err",
250+
# "cmd",
251+
"bucket-speedup",
252+
"raw-short",
253+
],
254+
)
255+
self.assertExists(output)
256+
232257
def test_duplicate(self):
233258
df = pandas.DataFrame(
234259
[

onnx_diagnostic/_command_lines_parser.py

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -667,6 +667,14 @@ def get_parser_agg() -> ArgumentParser:
667667
action=BooleanOptionalAction,
668668
help="Keeps only the most recent experiment for the same set of keys.",
669669
)
670+
parser.add_argument(
671+
"--keep-last-date",
672+
default=False,
673+
action=BooleanOptionalAction,
674+
help="Rewrite all dates to the last one to simplify the analysis, "
675+
"this assumes changing the date does not add ambiguity; if it does, option "
676+
"--recent should be added.",
677+
)
670678
parser.add_argument(
671679
"--raw",
672680
default=True,
@@ -683,6 +691,12 @@ def get_parser_agg() -> ArgumentParser:
683691
"multiple values are separated by `,`\n"
684692
"regular expressions are allowed",
685693
)
694+
parser.add_argument(
695+
"--drop-keys",
696+
default="",
697+
help="Drops keys from the given list. Sometimes it is faster "
698+
"to remove one than to select all the remaining ones.",
699+
)
686700
parser.add_argument(
687701
"-w",
688702
"--values",
@@ -709,7 +723,7 @@ def get_parser_agg() -> ArgumentParser:
709723
)
710724
parser.add_argument(
711725
"--views",
712-
default="agg-suite,disc,speedup,time,time_export,err,cmd,"
726+
default="agg-suite,agg-all,disc,speedup,time,time_export,err,cmd,"
713727
"bucket-speedup,raw-short,counts,peak-gpu",
714728
help="Views to add to the output files.",
715729
)
@@ -749,14 +763,16 @@ def _cmd_agg(argv: List[Any]):
749763
), f"Missing time column {args.time!r} in {c!r}\n{df.head()}\n{sorted(df.columns)}"
750764
dfs.append(df)
751765

766+
drop_keys = set(args.drop_keys.split(","))
752767
cube = CubeLogsPerformance(
753768
dfs,
754769
time=args.time,
755-
keys=[a for a in args.keys.split(",") if a],
770+
keys=[a for a in args.keys.split(",") if a and a not in drop_keys],
756771
values=[a for a in args.values.split(",") if a],
757772
ignored=[a for a in args.ignored.split(",") if a],
758773
recent=args.recent,
759774
formulas={k: k for k in args.formula.split(",")},
775+
keep_last_date=args.keep_last_date,
760776
)
761777
cube.load(verbose=max(args.verbose - 1, 0))
762778
if args.verbose:

onnx_diagnostic/helpers/log_helper.py

Lines changed: 68 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -372,7 +372,7 @@ def to_images(
372372
if merge:
373373
nn = len(df.columns) // 2
374374
nn += nn % 2
375-
fig, axs = plt.subplots(nn, 2, figsize=(12, 3 * nn))
375+
fig, axs = plt.subplots(nn, 2, figsize=(12, 3 * nn * df.shape[0] / 12))
376376
pos = 0
377377
for c in loop:
378378
ax = axs[pos // 2, pos % 2]
@@ -455,6 +455,7 @@ def __init__(
455455
]
456456
] = None,
457457
fill_missing: Optional[Sequence[Tuple[str, Any]]] = None,
458+
keep_last_date: bool = False,
458459
):
459460
self._data = data
460461
self._time = time
@@ -464,6 +465,7 @@ def __init__(
464465
self.recent = recent
465466
self._formulas = formulas
466467
self.fill_missing = fill_missing
468+
self.keep_last_date = keep_last_date
467469

468470
def post_load_process_piece(
469471
self, df: pandas.DataFrame, unique: bool = False
@@ -613,6 +615,16 @@ def load(self, verbose: int = 0):
613615
if self.keys_with_nans:
614616
print(f"[CubeLogs.load] keys_with_nans={self.keys_with_nans}")
615617
self.data[self.time] = pandas.to_datetime(self.data[self.time])
618+
619+
if self.keep_last_date:
620+
times = self.data[self.time].dropna()
621+
mi, mx = times.min(), times.max()
622+
if mi != mx:
623+
print(f"[CubeLogs.load] setting all dates in column {self.time} to {mx!r}")
624+
self.data.loc[~self.data[self.time].isna(), self.time] = mx
625+
self.values_for_key[self.time] = {mx}
626+
if self.data[self.time].isna().max():
627+
self.values_for_key[self.time].add(np.nan)
616628
if verbose:
617629
print(f"[CubeLogs.load] done, shape={self.shape}")
618630
return self
@@ -821,11 +833,6 @@ def view(
821833
unique = set()
822834

823835
_md = lambda s: {k: v for k, v in self.values_for_key.items() if k in s} # noqa: E731
824-
assert key_index, (
825-
f"view_def.name={view_def.name!r}, "
826-
f"key_index should not be empty, got initially {key_index0!r}, "
827-
f"unique={_md(key_index0)}"
828-
)
829836
all_cols = set(key_columns) | set(key_index) | set(key_agg) | unique
830837
assert all_cols == set(self.keys_time), (
831838
f"view_def.name={view_def.name!r}, "
@@ -870,12 +877,6 @@ def view(
870877
key_columns = [c for c in key_columns if c not in seti]
871878
values = [c for c in values if c not in seti]
872879

873-
assert key_index, (
874-
f"view_def.name={view_def.name!r}, view_def={view_def}, "
875-
f"key_index is empty, key_columns={key_columns}, value={values}, "
876-
f"columns={data.columns},shape={data.shape}"
877-
)
878-
879880
# final verification
880881
if verbose:
881882
print(f"[CubeLogs.view] key_index={key_index}")
@@ -896,7 +897,14 @@ def view(
896897
# pivot
897898
if verbose:
898899
print(f"[CubeLogs.view] values={values}")
899-
piv = data.pivot(index=key_index[::-1], columns=key_columns, values=values)
900+
if key_index:
901+
piv = data.pivot(index=key_index[::-1], columns=key_columns, values=values)
902+
else:
903+
# pivot does not return the same rank when the index is empty.
904+
# Let's artificially add one.
905+
data = data.copy()
906+
data["ALL"] = "ALL"
907+
piv = data.pivot(index=["ALL"], columns=key_columns, values=values)
900908
if isinstance(piv, pandas.Series):
901909
piv = piv.to_frame(name="series")
902910
names = list(piv.columns.names)
@@ -1106,7 +1114,7 @@ def to_excel(
11061114
if memory > 2**22:
11071115
msg = (
11081116
f"[CubeLogs.to_excel] skipping {name!r}, "
1109-
f"too big for excel {memory} bytes"
1117+
f"too big for excel with {memory} bytes"
11101118
)
11111119
if verbose:
11121120
print(msg)
@@ -1123,13 +1131,26 @@ def to_excel(
11231131
plots.append(CubePlot(df, kind="barh", orientation="row", split=True))
11241132
if raw:
11251133
assert main not in views, f"{main!r} is duplicated in views {sorted(views)}"
1126-
if verbose:
1127-
print(f"[CubeLogs.to_excel] add sheet {raw!r} with shape {self.shape}")
1128-
self.data.to_excel(writer, sheet_name=raw, freeze_panes=(1, 1), index=True)
11291134
# Too long.
11301135
# self._apply_excel_style(raw, writer, self.data)
11311136
if csv and "raw" in csv:
11321137
df.reset_index(drop=False).to_csv(f"{output}.raw.csv", index=False)
1138+
memory = df.memory_usage(deep=True).sum()
1139+
if memory > 2**22:
1140+
msg = (
1141+
f"[CubeLogs.to_excel] skipping 'raw', "
1142+
f"too big for excel with {memory} bytes"
1143+
)
1144+
if verbose:
1145+
print(msg)
1146+
else:
1147+
warnings.warn(msg, category=RuntimeWarning, stacklevel=0)
1148+
else:
1149+
if verbose:
1150+
print(f"[CubeLogs.to_excel] add sheet 'raw' with shape {self.shape}")
1151+
self.data.to_excel(
1152+
writer, sheet_name="raw", freeze_panes=(1, 1), index=True
1153+
)
11331154

11341155
if plots:
11351156
from openpyxl.drawing.image import Image
@@ -1236,6 +1257,7 @@ def __init__(
12361257
"time_export_unbiased",
12371258
),
12381259
fill_missing: Optional[Sequence[Tuple[str, Any]]] = (("model_attn_impl", "eager"),),
1260+
keep_last_date: bool = False,
12391261
):
12401262
super().__init__(
12411263
data=data,
@@ -1246,6 +1268,7 @@ def __init__(
12461268
recent=recent,
12471269
formulas=formulas,
12481270
fill_missing=fill_missing,
1271+
keep_last_date=keep_last_date,
12491272
)
12501273

12511274
def _process_formula(
@@ -1577,6 +1600,34 @@ def mean_geo(gr):
15771600
keep_columns_in_index=["suite"],
15781601
name="agg-suite",
15791602
order=order,
1603+
),
1604+
"agg-all": lambda: CubeViewDef(
1605+
key_index=index_cols,
1606+
values=self._filter_column(
1607+
[
1608+
"TIME_ITER",
1609+
"speedup",
1610+
"time_latency",
1611+
"time_latency_eager",
1612+
"time_export_success",
1613+
"time_export_unbiased",
1614+
"^n_.*",
1615+
"target_opset",
1616+
"onnx_filesize",
1617+
"onnx_weight_size_torch",
1618+
"onnx_weight_size_proto",
1619+
"onnx_n_nodes",
1620+
"peak_gpu_torch",
1621+
"peak_gpu_nvidia",
1622+
],
1623+
self.values,
1624+
),
1625+
ignore_unique=True,
1626+
key_agg=["model_name", "task", "model_task", "suite"],
1627+
agg_args=lambda column_name: "sum" if column_name.startswith("n_") else "mean",
1628+
agg_multi={"speedup_weighted": mean_weight, "speedup_geo": mean_geo},
1629+
name="agg-all",
1630+
order=order,
15801631
plots=True,
15811632
),
15821633
"disc": lambda: CubeViewDef(

0 commit comments

Comments
 (0)