 import re
 import zipfile
 from typing import Any, Callable, Dict, Iterator, List, Optional, Sequence, Tuple, Union
-from .helper import string_sig
+import numpy as np
 import pandas
 from pandas.api.types import is_numeric_dtype
+from .helper import string_sig


 def enumerate_csv_files(
@@ -197,6 +198,27 @@ def load(self, verbose: int = 0):
             if verbose:
                 print(f"[CubeLogs.load] load from list of dicts, n={len(self._data)}")
             self.data = pandas.DataFrame(self._data)
+        elif isinstance(self._data, list) and all(
+            isinstance(r, pandas.DataFrame) for r in self._data
+        ):
+            if verbose:
+                print(f"[CubeLogs.load] load from list of DataFrame, n={len(self._data)}")
+            self.data = pandas.concat(self._data, axis=0)
+        elif isinstance(self._data, list):
+            cubes = []
+            for item in enumerate_csv_files(self._data, verbose=verbose):
+                df = open_dataframe(item)
+                cube = CubeLogs(
+                    df,
+                    time=self._time,
+                    keys=self._keys,
+                    values=self._values,
+                    ignored=self._ignored,
+                    recent=self.recent,
+                )
+                cube.load()
+                cubes.append(cube.data)
+            self.data = pandas.concat(cubes, axis=0)
         else:
             raise NotImplementedError(
                 f"Not implemented with the provided data (type={type(self._data)})"
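
Note on the hunk above: `load` now accepts three list shapes. A list of dicts becomes a DataFrame, a list of DataFrames is concatenated, and any other list is treated as CSV sources expanded through `enumerate_csv_files` and loaded recursively. A minimal usage sketch for the DataFrame case, assuming the constructor takes the same keyword arguments as the recursive call in the hunk ("date", "model", "speedup" are made-up column names):

import pandas

df1 = pandas.DataFrame({"date": ["2024-01-01"], "model": ["m1"], "speedup": [1.1]})
df2 = pandas.DataFrame({"date": ["2024-01-02"], "model": ["m1"], "speedup": [1.2]})

# The new branch concatenates the list before the usual preprocessing.
cube = CubeLogs([df1, df2], time="date", keys=["model"], values=["speedup"])
cube.load()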
@@ -281,16 +303,25 @@ def _preprocess(self):
         last = self.values[0]
         gr = self.data[[self.time, *self.keys, last]].groupby([self.time, *self.keys]).count()
         gr = gr[gr[last] > 1]
-        assert gr.shape[0] == 0, f"There are duplicated rows:\n{gr}"
         if self.recent:
-            gr = self.data[[*self.keys, self.time]].groupby(self.keys, as_index=False).max()
-            filtered = pandas.merge(self.data, gr, on=[self.time, *self.keys])
+            cp = self.data.copy()
+            assert (
+                "__index__" not in cp.columns
+            ), f"'__index__' should not be a column in {cp.columns}"
+            cp["__index__"] = np.arange(cp.shape[0])
+            gr = (
+                cp[[*self.keys, self.time, "__index__"]]
+                .groupby(self.keys, as_index=False)
+                .max()
+            )
+            filtered = pandas.merge(cp, gr, on=[self.time, "__index__", *self.keys])
             assert filtered.shape[0] <= self.data.shape[0], (
                 f"Keeping the latest row brings more row {filtered.shape} "
                 f"(initial is {self.data.shape})."
             )
-            self.data = filtered
+            self.data = filtered.drop("__index__", axis=1)
         else:
+            assert gr.shape[0] == 0, f"There are duplicated rows:\n{gr}"
             gr = self.data[[*self.keys, self.time]].groupby(self.keys).count()
             gr = gr[gr[self.time] > 1]
             assert (
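
Note on the hunk above: the duplicate-row assert now only guards the non-`recent` path, and `__index__` acts as a tiebreaker when several rows share the same keys and timestamp. Grouping by the keys takes the max of both the timestamp and the insertion index, and merging back on all three columns keeps exactly one row per key. A standalone sketch of the same pattern with made-up columns; it assumes the last inserted row of each group also carries the latest timestamp, which holds for appended logs:

import numpy as np
import pandas

data = pandas.DataFrame(
    {
        "model": ["m1", "m1", "m2"],
        "date": ["2024-01-02", "2024-01-02", "2024-01-01"],
        "speedup": [1.0, 1.1, 2.0],
    }
)
cp = data.copy()
cp["__index__"] = np.arange(cp.shape[0])  # insertion order as a tiebreaker
gr = cp[["model", "date", "__index__"]].groupby("model", as_index=False).max()
filtered = pandas.merge(cp, gr, on=["date", "__index__", "model"])
print(filtered.drop("__index__", axis=1))  # one row per model: (m1, 1.1), (m2, 2.0)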