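fix issues: add enumerate_csv_files/open_dataframe helpers, remove stray `stop` in tiny-LLM test, raise dynamic dim max to 8192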

xadupre · xadupre · commit eb15c53975a0 · 2025-06-16T11:43:05.000+02:00
diff --git a/_unittests/ut_helpers/test_log_helper.py b/_unittests/ut_helpers/test_log_helper.py
@@ -1,9 +1,16 @@
 import io
+import os
 import textwrap
 import unittest
+import zipfile
 import pandas
 from onnx_diagnostic.ext_test_case import ExtTestCase, hide_stdout
-from onnx_diagnostic.helpers.log_helper import CubeLogs, CubeViewDef
+from onnx_diagnostic.helpers.log_helper import (
+    CubeLogs,
+    CubeViewDef,
+    enumerate_csv_files,
+    open_dataframe,
+)
 
 
 class TestLogHelper(ExtTestCase):
@@ -152,6 +159,21 @@ def test_cube_logs_excel(self):
         )
         self.assertExists(output)
 
+    def test_enumerate_csv_files(self):
+        # Dumps the same data as a csv file and inside a zip, then enumerates both.
+        filename = self.get_dump_file("test_enumerate_csv_files.csv")
+        self.df1().to_csv(filename, index=False)
+        zip_file = self.get_dump_file("test_enumerate_csv_files.zip")
+        with zipfile.ZipFile(zip_file, "w", zipfile.ZIP_DEFLATED) as zipf:
+            zipf.write(filename)
+
+        dirname = os.path.dirname(filename)
+        data = [os.path.join(dirname, "*.csv"), os.path.join(dirname, "*.zip")]
+        found = list(enumerate_csv_files(data, verbose=1))
+        self.assertNotEmpty(found)
+        for item in found:
+            self.assertIsInstance(open_dataframe(item), pandas.DataFrame)
+
 
 if __name__ == "__main__":
     unittest.main(verbosity=2)
diff --git a/_unittests/ut_helpers/test_ort_session_tinyllm.py b/_unittests/ut_helpers/test_ort_session_tinyllm.py
@@ -82,11 +82,10 @@ def test_check_allruntimes_on_tiny_llm(self):
         model, inputs, ds = data["model"], data["inputs"], data["dynamic_shapes"]
         expected = model(**copy.deepcopy(inputs))
 
-        with torch_export_patches(patch_transformers=True):
+        with torch_export_patches(patch_transformers=True, stop_if_static=1):
             if to_onnx:
                 proto = to_onnx(model, (), kwargs=copy.deepcopy(inputs), dynamic_shapes=ds)
             else:
-                stop
                 proto = torch.onnx.export(
                     model, (), kwargs=copy.deepcopy(inputs), dynamic_shapes=ds, dynamo=True
                 ).model_proto
diff --git a/onnx_diagnostic/helpers/log_helper.py b/onnx_diagnostic/helpers/log_helper.py
@@ -1,10 +1,132 @@
+import datetime
+import glob
+import os
 import re
-from typing import Any, Callable, Dict, Optional, Sequence, Tuple
+import zipfile
+from typing import Any, Callable, Dict, Iterator, List, Optional, Sequence, Tuple, Union
 from .helper import string_sig
 import pandas
 from pandas.api.types import is_numeric_dtype
 
 
+def enumerate_csv_files(
+    data: Union[
+        pandas.DataFrame, List[Union[str, Tuple[str, str]]], str, Tuple[str, str, str, str]
+    ],
+    verbose: int = 0,
+) -> Iterator[Union[pandas.DataFrame, str, Tuple[str, str, str, str]]]:
+    """
+    Enumerates files considered for the aggregation.
+    Only csv files are considered. If a zip file is given,
+    the function loops over the csv candidates it contains.
+
+    :param data: dataframe with the raw data or a file or list of files
+    :param verbose: verbosity, prints what is found if not null
+    :return: iterator on dataframes or tuples ``(name, date, full name, zip or '')``
+
+    data can contain:
+    * a dataframe, yielded as is
+    * a string for a filename, zip or csv, or a pattern given to ``glob.glob``
+    * a list of strings
+    * a tuple, yielded as is
+    """
+    if not isinstance(data, list):
+        data = [data]
+    for itn, filename in enumerate(data):
+        if isinstance(filename, pandas.DataFrame):
+            if verbose:
+                print(f"[enumerate_csv_files] data[{itn}] is a dataframe")
+            yield filename
+            continue
+
+        if isinstance(filename, tuple):
+            # A file in a zipfile
+            if verbose:
+                print(f"[enumerate_csv_files] data[{itn}] is {filename!r}")
+            yield filename
+            continue
+
+        if os.path.exists(filename):
+            ext = os.path.splitext(filename)[-1]
+            if ext == ".csv":
+                # We check the first line is ok, the file is closed before yielding.
+                if verbose:
+                    print(f"[enumerate_csv_files] data[{itn}] is a csv file: {filename!r}]")
+                with open(filename, "r", encoding="utf-8") as f:
+                    line = f.readline()
+                if "~help" in line or (",CMD" not in line and ",DATE" not in line):
+                    continue
+                dt = datetime.datetime.fromtimestamp(os.stat(filename).st_mtime)
+                du = dt.strftime("%Y-%m-%d %H:%M:%S")
+                yield (os.path.split(filename)[-1], du, filename, "")
+                continue
+
+            if ext == ".zip":
+                if verbose:
+                    print(f"[enumerate_csv_files] data[{itn}] is a zip file: {filename!r}]")
+                with zipfile.ZipFile(filename, "r") as zf:  # closed on early stop too
+                    for ii, info in enumerate(zf.infolist()):
+                        name = info.filename
+                        if os.path.splitext(name)[-1] != ".csv":
+                            continue
+                        if verbose:
+                            print(
+                                f"[enumerate_csv_files] data[{itn}][{ii}] "
+                                f"is a csv file: {name!r}]"
+                            )
+                        with zf.open(name) as zfh:
+                            zfh.readline()  # only checks the member is readable
+                        yield (
+                            os.path.split(name)[-1],
+                            "%04d-%02d-%02d %02d:%02d:%02d" % info.date_time,
+                            name,
+                            filename,
+                        )
+                continue
+
+            raise AssertionError(f"Unexpected format {filename!r}, cannot read it.")
+
+        # filename is a pattern.
+        found = glob.glob(filename)
+        if verbose and not found:
+            print(f"[enumerate_csv_files] unable to find file in {filename!r}")
+        for ii, f in enumerate(found):
+            if verbose:
+                print(f"[enumerate_csv_files] data[{itn}][{ii}] {f!r} from {filename!r}")
+            yield from enumerate_csv_files(f, verbose=verbose)
+
+
+def open_dataframe(
+    data: Union[str, Tuple[str, str, str, str], pandas.DataFrame],
+) -> pandas.DataFrame:
+    """
+    Opens a filename and stores its origin in column ``RAWFILENAME``.
+
+    :param data: a dataframe (returned as is), a filename, a tuple indicating
+        the file is coming from a zip file (last item) or not (empty last item)
+    :return: a dataframe
+    :raises ValueError: if data is not a dataframe, a string or a tuple
+    """
+    if isinstance(data, pandas.DataFrame):
+        return data
+    if isinstance(data, str):
+        df = pandas.read_csv(data)
+        df["RAWFILENAME"] = data
+        return df
+    if isinstance(data, tuple):
+        if not data[-1]:
+            df = pandas.read_csv(data[2])
+            df["RAWFILENAME"] = data[2]
+            return df
+        # Both handles are released even if the csv cannot be opened or parsed.
+        with zipfile.ZipFile(data[-1]) as zf, zf.open(data[2]) as f:
+            df = pandas.read_csv(f)
+        df["RAWFILENAME"] = f"{data[-1]}/{data[2]}"
+        return df
+
+    raise ValueError(f"Unexpected value for data: {data!r}")
+
+
 class CubeViewDef:
     """
     Defines how to compute a view.
diff --git a/onnx_diagnostic/torch_models/untrained/llm_tiny_llm.py b/onnx_diagnostic/torch_models/untrained/llm_tiny_llm.py
@@ -58,8 +58,8 @@ def get_tiny_llm(
     num_key_value_heads = config["num_key_value_heads"]
 
     batch = torch.export.Dim("batch", min=1, max=1024)
-    seq_length = torch.export.Dim("seq_length", min=1, max=4096)
-    cache_length = torch.export.Dim("cache_length", min=1, max=4096)
+    seq_length = torch.export.Dim("seq_length", min=1, max=8192)
+    cache_length = torch.export.Dim("cache_length", min=1, max=8192)
 
     shapes = {
         "input_ids": {0: batch, 1: seq_length},