Skip to content

Commit 205b288

Browse files
authored
Minor fixes (#157)
* Fix minor details * ch * fix issues
1 parent 432b14d commit 205b288

File tree

4 files changed

+113
-20
lines changed

4 files changed

+113
-20
lines changed

CHANGELOGS.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ Change Logs
44
0.7.1
55
+++++
66

7+
* :pr:`156`, :pr:`157`: add plots and other options to deal with the unpredictable
78
* :pr:`155`: better aggregation of historical data
89
* :pr:`151`, :pr:`153`: adds command line ``agg``, class CubeLogsPerformance to produce timeseries
910
* :pr:`152`: add a function to compute fully dynamic shapes given any inputs

_unittests/ut_helpers/test_log_helper.py

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -204,7 +204,7 @@ def test_enumerate_csv_files(self):
204204
self.assertEqual((3, 11), cube.shape)
205205
self.assertIn("RAWFILENAME", cube.data.columns)
206206

207-
def test_cube_logs_performance(self):
207+
def test_cube_logs_performance1(self):
208208
output = self.get_dump_file("test_cube_logs_performance.xlsx")
209209
filename = os.path.join(os.path.dirname(__file__), "data", "data-agg.zip")
210210
assert list(enumerate_csv_files(filename))
@@ -229,6 +229,31 @@ def test_cube_logs_performance(self):
229229
)
230230
self.assertExists(output)
231231

232+
def test_cube_logs_performance2(self):
233+
output = self.get_dump_file("test_cube_logs_performance.xlsx")
234+
filename = os.path.join(os.path.dirname(__file__), "data", "data-agg.zip")
235+
assert list(enumerate_csv_files(filename))
236+
dfs = [open_dataframe(df) for df in enumerate_csv_files(filename)]
237+
assert dfs, f"{filename!r} empty"
238+
cube = CubeLogsPerformance(dfs, keep_last_date=True)
239+
cube.load()
240+
cube.to_excel(
241+
output,
242+
views=[
243+
"agg-suite",
244+
"disc",
245+
"speedup",
246+
"counts",
247+
"time",
248+
"time_export",
249+
"err",
250+
# "cmd",
251+
"bucket-speedup",
252+
"raw-short",
253+
],
254+
)
255+
self.assertExists(output)
256+
232257
def test_duplicate(self):
233258
df = pandas.DataFrame(
234259
[

onnx_diagnostic/_command_lines_parser.py

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -637,6 +637,14 @@ def get_parser_agg() -> ArgumentParser:
637637
action=BooleanOptionalAction,
638638
help="Keeps only the most recent experiment for the same of keys.",
639639
)
640+
parser.add_argument(
641+
"--keep-last-date",
642+
default=False,
643+
action=BooleanOptionalAction,
644+
help="Rewrite all dates to the last one to simplifies the analysis, "
645+
"this assume changing the date does not add ambiguity, if any, option "
646+
"--recent should be added.",
647+
)
640648
parser.add_argument(
641649
"--raw",
642650
default=True,
@@ -653,6 +661,12 @@ def get_parser_agg() -> ArgumentParser:
653661
"multiple values are separated by `,`\n"
654662
"regular expressions are allowed",
655663
)
664+
parser.add_argument(
665+
"--drop-keys",
666+
default="",
667+
help="Drops keys from the given list. Something it is faster "
668+
"to remove one than to select all the remaining ones.",
669+
)
656670
parser.add_argument(
657671
"-w",
658672
"--values",
@@ -679,7 +693,7 @@ def get_parser_agg() -> ArgumentParser:
679693
)
680694
parser.add_argument(
681695
"--views",
682-
default="agg-suite,disc,speedup,time,time_export,err,cmd,"
696+
default="agg-suite,agg-all,disc,speedup,time,time_export,err,cmd,"
683697
"bucket-speedup,raw-short,counts,peak-gpu",
684698
help="Views to add to the output files.",
685699
)
@@ -719,14 +733,16 @@ def _cmd_agg(argv: List[Any]):
719733
), f"Missing time column {args.time!r} in {c!r}\n{df.head()}\n{sorted(df.columns)}"
720734
dfs.append(df)
721735

736+
drop_keys = set(args.drop_keys.split(","))
722737
cube = CubeLogsPerformance(
723738
dfs,
724739
time=args.time,
725-
keys=[a for a in args.keys.split(",") if a],
740+
keys=[a for a in args.keys.split(",") if a and a not in drop_keys],
726741
values=[a for a in args.values.split(",") if a],
727742
ignored=[a for a in args.ignored.split(",") if a],
728743
recent=args.recent,
729744
formulas={k: k for k in args.formula.split(",")},
745+
keep_last_date=args.keep_last_date,
730746
)
731747
cube.load(verbose=max(args.verbose - 1, 0))
732748
if args.verbose:

onnx_diagnostic/helpers/log_helper.py

Lines changed: 68 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -372,7 +372,7 @@ def to_images(
372372
if merge:
373373
nn = len(df.columns) // 2
374374
nn += nn % 2
375-
fig, axs = plt.subplots(nn, 2, figsize=(12, 3 * nn))
375+
fig, axs = plt.subplots(nn, 2, figsize=(12, 3 * nn * df.shape[0] / 12))
376376
pos = 0
377377
for c in loop:
378378
ax = axs[pos // 2, pos % 2]
@@ -455,6 +455,7 @@ def __init__(
455455
]
456456
] = None,
457457
fill_missing: Optional[Sequence[Tuple[str, Any]]] = None,
458+
keep_last_date: bool = False,
458459
):
459460
self._data = data
460461
self._time = time
@@ -464,6 +465,7 @@ def __init__(
464465
self.recent = recent
465466
self._formulas = formulas
466467
self.fill_missing = fill_missing
468+
self.keep_last_date = keep_last_date
467469

468470
def post_load_process_piece(
469471
self, df: pandas.DataFrame, unique: bool = False
@@ -613,6 +615,16 @@ def load(self, verbose: int = 0):
613615
if self.keys_with_nans:
614616
print(f"[CubeLogs.load] keys_with_nans={self.keys_with_nans}")
615617
self.data[self.time] = pandas.to_datetime(self.data[self.time])
618+
619+
if self.keep_last_date:
620+
times = self.data[self.time].dropna()
621+
mi, mx = times.min(), times.max()
622+
if mi != mx:
623+
print(f"[CubeLogs.load] setting all dates in column {self.time} to {mx!r}")
624+
self.data.loc[~self.data[self.time].isna(), self.time] = mx
625+
self.values_for_key[self.time] = {mx}
626+
if self.data[self.time].isna().max():
627+
self.values_for_key[self.time].add(np.nan)
616628
if verbose:
617629
print(f"[CubeLogs.load] done, shape={self.shape}")
618630
return self
@@ -821,11 +833,6 @@ def view(
821833
unique = set()
822834

823835
_md = lambda s: {k: v for k, v in self.values_for_key.items() if k in s} # noqa: E731
824-
assert key_index, (
825-
f"view_def.name={view_def.name!r}, "
826-
f"key_index should not be empty, got initially {key_index0!r}, "
827-
f"unique={_md(key_index0)}"
828-
)
829836
all_cols = set(key_columns) | set(key_index) | set(key_agg) | unique
830837
assert all_cols == set(self.keys_time), (
831838
f"view_def.name={view_def.name!r}, "
@@ -870,12 +877,6 @@ def view(
870877
key_columns = [c for c in key_columns if c not in seti]
871878
values = [c for c in values if c not in seti]
872879

873-
assert key_index, (
874-
f"view_def.name={view_def.name!r}, view_def={view_def}, "
875-
f"key_index is empty, key_columns={key_columns}, value={values}, "
876-
f"columns={data.columns},shape={data.shape}"
877-
)
878-
879880
# final verification
880881
if verbose:
881882
print(f"[CubeLogs.view] key_index={key_index}")
@@ -896,7 +897,14 @@ def view(
896897
# pivot
897898
if verbose:
898899
print(f"[CubeLogs.view] values={values}")
899-
piv = data.pivot(index=key_index[::-1], columns=key_columns, values=values)
900+
if key_index:
901+
piv = data.pivot(index=key_index[::-1], columns=key_columns, values=values)
902+
else:
903+
# pivot does return the same rank when it is empty.
904+
# Let's artificially add one
905+
data = data.copy()
906+
data["ALL"] = "ALL"
907+
piv = data.pivot(index=["ALL"], columns=key_columns, values=values)
900908
if isinstance(piv, pandas.Series):
901909
piv = piv.to_frame(name="series")
902910
names = list(piv.columns.names)
@@ -1106,7 +1114,7 @@ def to_excel(
11061114
if memory > 2**22:
11071115
msg = (
11081116
f"[CubeLogs.to_excel] skipping {name!r}, "
1109-
f"too big for excel {memory} bytes"
1117+
f"too big for excel with {memory} bytes"
11101118
)
11111119
if verbose:
11121120
print(msg)
@@ -1123,13 +1131,26 @@ def to_excel(
11231131
plots.append(CubePlot(df, kind="barh", orientation="row", split=True))
11241132
if raw:
11251133
assert main not in views, f"{main!r} is duplicated in views {sorted(views)}"
1126-
if verbose:
1127-
print(f"[CubeLogs.to_excel] add sheet {raw!r} with shape {self.shape}")
1128-
self.data.to_excel(writer, sheet_name=raw, freeze_panes=(1, 1), index=True)
11291134
# Too long.
11301135
# self._apply_excel_style(raw, writer, self.data)
11311136
if csv and "raw" in csv:
11321137
df.reset_index(drop=False).to_csv(f"{output}.raw.csv", index=False)
1138+
memory = df.memory_usage(deep=True).sum()
1139+
if memory > 2**22:
1140+
msg = (
1141+
f"[CubeLogs.to_excel] skipping 'raw', "
1142+
f"too big for excel with {memory} bytes"
1143+
)
1144+
if verbose:
1145+
print(msg)
1146+
else:
1147+
warnings.warn(msg, category=RuntimeWarning, stacklevel=0)
1148+
else:
1149+
if verbose:
1150+
print(f"[CubeLogs.to_excel] add sheet 'raw' with shape {self.shape}")
1151+
self.data.to_excel(
1152+
writer, sheet_name="raw", freeze_panes=(1, 1), index=True
1153+
)
11331154

11341155
if plots:
11351156
from openpyxl.drawing.image import Image
@@ -1236,6 +1257,7 @@ def __init__(
12361257
"time_export_unbiased",
12371258
),
12381259
fill_missing: Optional[Sequence[Tuple[str, Any]]] = (("model_attn_impl", "eager"),),
1260+
keep_last_date: bool = False,
12391261
):
12401262
super().__init__(
12411263
data=data,
@@ -1246,6 +1268,7 @@ def __init__(
12461268
recent=recent,
12471269
formulas=formulas,
12481270
fill_missing=fill_missing,
1271+
keep_last_date=keep_last_date,
12491272
)
12501273

12511274
def _process_formula(
@@ -1577,6 +1600,34 @@ def mean_geo(gr):
15771600
keep_columns_in_index=["suite"],
15781601
name="agg-suite",
15791602
order=order,
1603+
),
1604+
"agg-all": lambda: CubeViewDef(
1605+
key_index=index_cols,
1606+
values=self._filter_column(
1607+
[
1608+
"TIME_ITER",
1609+
"speedup",
1610+
"time_latency",
1611+
"time_latency_eager",
1612+
"time_export_success",
1613+
"time_export_unbiased",
1614+
"^n_.*",
1615+
"target_opset",
1616+
"onnx_filesize",
1617+
"onnx_weight_size_torch",
1618+
"onnx_weight_size_proto",
1619+
"onnx_n_nodes",
1620+
"peak_gpu_torch",
1621+
"peak_gpu_nvidia",
1622+
],
1623+
self.values,
1624+
),
1625+
ignore_unique=True,
1626+
key_agg=["model_name", "task", "model_task", "suite"],
1627+
agg_args=lambda column_name: "sum" if column_name.startswith("n_") else "mean",
1628+
agg_multi={"speedup_weighted": mean_weight, "speedup_geo": mean_geo},
1629+
name="agg-all",
1630+
order=order,
15801631
plots=True,
15811632
),
15821633
"disc": lambda: CubeViewDef(

0 commit comments

Comments
 (0)