sdpython
diff --git a/‎.github/workflows/ci.yml‎
Lines changed: 60 additions & 0 deletions b/‎.github/workflows/ci.yml‎
Lines changed: 60 additions & 0 deletions
diff --git a/‎.github/workflows/documentation.yml‎
Lines changed: 4 additions & 3 deletions b/‎.github/workflows/documentation.yml‎
Lines changed: 4 additions & 3 deletions
diff --git a/‎_doc/examples/plot_export_tiny_llm.py‎
Lines changed: 227 additions & 0 deletions b/‎_doc/examples/plot_export_tiny_llm.py‎
Lines changed: 227 additions & 0 deletions
@@ -0,0 +1,60 @@
+name: nightly-ci
+
+# NOTE(review): despite its name, this workflow declares no `schedule` trigger.
+# It runs on every push and when a pull request targeting main is closed.
+on:
+  push:
+  pull_request:
+    types:
+      - closed
+    branches:
+      - main
+
+jobs:
+  run:
+    name: nightly-ci ${{ matrix.os }}-${{ matrix.python }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        os: [ubuntu-latest]
+        python: ['3.12']
+
+    steps:
+      - uses: actions/checkout@v3
+
+      - uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python }}
+
+      # The cache must be restored before anything is installed,
+      # otherwise the install steps below never benefit from it.
+      - name: Cache pip
+        uses: actions/cache@v4
+        with:
+          path: ~/.cache/pip
+          key: ${{ runner.os }}-pip-${{ hashFiles('requirements.txt', 'requirements-dev.txt') }}
+          restore-keys: |
+            ${{ runner.os }}-pip-
+            ${{ runner.os }}-
+
+      # Nightly CPU build of pytorch.
+      - name: Install pytorch
+        run: python -m pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu
+
+      # Development version of transformers, the history is not needed.
+      - name: Install transformers
+        run: |
+          git clone --depth 1 https://github.com/huggingface/transformers.git
+          cd transformers
+          python -m pip install -e .
+          cd ..
+
+      - name: Install requirements
+        run: python -m pip install -r requirements.txt
+
+      - name: Install requirements dev
+        run: python -m pip install -r requirements-dev.txt
+
+      # Logs the installed versions to help investigating a failure.
+      - name: pip freeze
+        run: python -m pip freeze
+
+      # PYTHONPATH only lives in the shell of this step,
+      # there is no need to reset it afterwards.
+      - name: run tests
+        run: |
+          python -m pip install pytest
+          export PYTHONPATH=.
+          UNITTEST_GOING=1 pytest --durations=10 _unittests
@@ -1,4 +1,4 @@
-name: Documentation and Code Coverage
+name: Code Coverage and Documentation
 
 on:
   push:
@@ -10,18 +10,19 @@ on:
 
 jobs:
   run:
-    name: Build documentation on ${{ matrix.os }}
+    name: Code Coverage and Documentation on ${{ matrix.os }}-${{ matrix.python }}
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
         os: [ubuntu-latest]
+        python: ['3.12']
 
     steps:
       - uses: actions/checkout@v3
 
       - uses: actions/setup-python@v4
         with:
-          python-version: '3.12'
+          python-version: ${{ matrix.python }}
 
       - uses: tlylt/install-graphviz@v1
 
 
@@ -0,0 +1,227 @@
+"""
+Export LLM with dynamic shapes
+==============================
+
+We focus on the model
+`Tiny-LLM <https://huggingface.co/arnir0/Tiny-LLM>`_.
+To avoid downloading any weights, we write a function creating a
+random model based on the same architecture.
+
+Guess the cache dimension
++++++++++++++++++++++++++
+
+The first step is to guess the dummy inputs.
+Let's use the true model for that.
+We use the dummy example from the model page.
+"""
+
+import copy
+from typing import Any, Dict
+import torch
+import transformers
+from onnx_diagnostic.helpers import string_type
+from onnx_diagnostic.cache_helpers import make_dynamic_cache
+
+
+# Identifier of the pretrained model, the same as the link in the introduction.
+MODEL_NAME = "arnir0/Tiny-LLM"
+# Both objects are loaded from the pretrained model identified by MODEL_NAME.
+tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_NAME)
+model = transformers.AutoModelForCausalLM.from_pretrained(MODEL_NAME)
+
+# %%
+# We rewrite the forward method to print the cache dimension.
+
+
+def string_inputs(args, kwargs):
+    def _cache(a):
+        if len(a.key_cache):
+            return f"n_caches={len(a.key_cache)}, shape={a.key_cache[0].shape}"
+        return f"n_caches={len(a.key_cache)}"
+
+    for a in args:
+        if isinstance(a, transformers.cache_utils.DynamicCache):
+            return _cache(a)
+    for k, a in kwargs.items():
+        if isinstance(a, transformers.cache_utils.DynamicCache):
+            return f"{k}={_cache(a)}"
+    return "no_cache"
+
+
+def _forward_(*args, _f=None, **kwargs):
+    assert _f is not None
+    if not torch.compiler.is_exporting():
+        print("<-", string_type((args, kwargs), with_shape=True, with_min_max=True))
+    res = _f(*args, **kwargs)
+    if not torch.compiler.is_exporting():
+        print("->", string_type((args, kwargs), with_shape=True, with_min_max=True))
+    return res
+
+
+# The original method is kept to restore it once the model has run.
+keep_model_forward = model.forward
+# The original method is bound as the default value of ``_f``,
+# ``_forward_`` prints around the call to it.
+model.forward = lambda *args, _f=keep_model_forward, **kwargs: _forward_(
+    *args, _f=_f, **kwargs
+)
+
+# %%
+# Let's run the model.
+prompt = "Continue: it rains..."
+inputs = tokenizer.encode(prompt, return_tensors="pt")
+
+# Sampling is enabled (do_sample=True), the text may change from one run to another.
+outputs = model.generate(
+    inputs, max_length=50, temperature=1, top_k=50, top_p=0.95, do_sample=True
+)
+
+generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
+print(generated_text)
+
+# %%
+# Let's restore the forward as it was.
+model.forward = keep_model_forward
+
+# %%
+# The model creation
+# ++++++++++++++++++
+#
+# Let's create an untrained model.
+
+
+def get_tiny_llm(
+    batch_size: int = 2,
+    input_cache: bool = True,
+    common_dynamic_shapes: bool = True,
+    dynamic_rope: bool = False,
+    **kwargs,
+) -> Dict[str, Any]:
+    """
+    Gets a non initialized model.
+
+    :param batch_size: batch size
+    :param input_cache: generate data for this iteration with or without cache
+    :param kwargs: to overwrite the configuration, example ``num_hidden_layers=1``
+    :param common_dynamic_shapes: if True returns dynamic shapes as well
+    :param dynamic_rope: use dynamic rope (see :class:`transformers.LlamaConfig`)
+    :return: dictionary
+    """
+    import transformers
+
+    config = {
+        "architectures": ["LlamaForCausalLM"],
+        "bos_token_id": 1,
+        "eos_token_id": 2,
+        "hidden_act": "silu",
+        "hidden_size": 192,
+        "initializer_range": 0.02,
+        "intermediate_size": 1024,
+        "max_position_embeddings": 1024,
+        "model_type": "llama",
+        "num_attention_heads": 2,
+        "num_hidden_layers": 1,
+        "num_key_value_heads": 1,
+        "pretraining_tp": 1,
+        "rms_norm_eps": 1e-05,
+        "rope_scaling": {"rope_type": "dynamic", "factor": 10.0} if dynamic_rope else None,
+        "tie_word_embeddings": False,
+        "torch_dtype": "float32",
+        "transformers_version": "4.31.0.dev0",
+        "use_cache": True,
+        "vocab_size": 32000,
+    }
+
+    config.update(**kwargs)
+    conf = transformers.LlamaConfig(**config)
+    model = transformers.LlamaForCausalLM(conf)
+    model.eval()
+
+    # now the inputs
+    cache_last_dim = 96
+    sequence_length = 30
+    sequence_length2 = 3
+    num_key_value_heads = 1
+    max_token_id = config["vocab_size"] - 1
+    n_layers = config["num_hidden_layers"]
+
+    batch = torch.export.Dim("batch", min=1, max=1024)
+    seq_length = torch.export.Dim("seq_length", min=1, max=4096)
+    cache_length = torch.export.Dim("cache_length", min=1, max=4096)
+
+    shapes = {
+        "input_ids": {0: batch, 1: seq_length},
+        "attention_mask": {
+            0: batch,
+            1: torch.export.Dim.DYNAMIC,  # cache_length + seq_length
+        },
+        "past_key_values": [
+            [{0: batch, 2: cache_length} for _ in range(n_layers)],
+            [{0: batch, 2: cache_length} for _ in range(n_layers)],
+        ],
+    }
+    inputs = dict(
+        input_ids=torch.randint(0, max_token_id, (batch_size, sequence_length2)).to(
+            torch.int64
+        ),
+        attention_mask=torch.ones((batch_size, sequence_length + sequence_length2)).to(
+            torch.int64
+        ),
+        past_key_values=make_dynamic_cache(
+            [
+                (
+                    torch.randn(
+                        batch_size, num_key_value_heads, sequence_length, cache_last_dim
+                    ),
+                    torch.randn(
+                        batch_size, num_key_value_heads, sequence_length, cache_last_dim
+                    ),
+                )
+                for i in range(n_layers)
+            ]
+        ),
+    )
+    return dict(inputs=inputs, model=model, dynamic_shapes=shapes)
+
+
+# %%
+# Let's get the model, inputs and dynamic shapes.
+
+experiment = get_tiny_llm()
+untrained_model, inputs, dynamic_shapes = (
+    experiment["model"],
+    experiment["inputs"],
+    experiment["dynamic_shapes"],
+)
+
+# %% Let's run it.
+expected_output = model(**inputs)
+print("result type", type(expected_output))
+
+# %%
+# It works.
+#
+# ExportedProgram
+# +++++++++++++++
+
+try:
+    ep = torch.export.export(
+        untrained_model, (), inputs, dynamic_shapes=dynamic_shapes, strict=False
+    )
+    print("It worked:")
+    print(ep)
+except Exception as e:
+    # To work, it needs at least PRs:
+    # * https://github.com/huggingface/transformers/pull/36311
+    # * https://github.com/huggingface/transformers/pull/36652
+    print("It failed:", e)
+
+
+# %%
+# Back to the original model
+# ++++++++++++++++++++++++++
+#
+# Let's use the same dummy inputs but we use the downloaded model.
+
+try:
+    ep = torch.export.export(model, (), inputs, dynamic_shapes=dynamic_shapes, strict=False)
+    print("It worked:")
+    print(ep)
+except Exception as e:
+    # To work, it needs at least PRs:
+    # * https://github.com/huggingface/transformers/pull/36311
+    # * https://github.com/huggingface/transformers/pull/36652
+    print("It failed:", e)