
Commit 87568cb

first step for moe
1 parent d214c04 commit 87568cb

4 files changed: +256 -2 lines changed

_unittests/ut_tasks/try_tasks.py

Lines changed: 99 additions & 2 deletions
@@ -99,8 +99,8 @@ def test_text2text_generation(self):
         print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))

     @never_test()
-    def test_text_generation_phi4(self):
-        # clear&&NEVERTEST=1 python _unittests/ut_tasks/try_tasks.py -k phi4
+    def test_text_generation_phi4_mini(self):
+        # clear&&NEVERTEST=1 python _unittests/ut_tasks/try_tasks.py -k phi4_mini

         import torch
         from transformers import RobertaTokenizer, T5ForConditionalGeneration
@@ -124,6 +124,103 @@ def test_text_generation_phi4(self):
         )
         print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))

+    @never_test()
+    def test_text_generation_phi4_moe(self):
+        # clear&&NEVERTEST=1 python _unittests/ut_tasks/try_tasks.py -k phi4_moe
+
+        import requests
+        import io
+        from PIL import Image
+        import soundfile as sf
+        from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig
+        from urllib.request import urlopen
+
+        # Define model path
+        model_path = "microsoft/Phi-4-multimodal-instruct"
+
+        # Load model and processor
+        processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
+        model = AutoModelForCausalLM.from_pretrained(
+            model_path,
+            device_map="cuda",
+            torch_dtype="auto",
+            trust_remote_code=True,
+            # if you do not use Ampere or later GPUs, change attention to "eager"
+            # _attn_implementation='flash_attention_2',
+            _attn_implementation="eager",
+        ).cuda()
+
+        # Load generation config
+        generation_config = GenerationConfig.from_pretrained(model_path)
+
+        # Define prompt structure
+        user_prompt = "<|user|>"
+        assistant_prompt = "<|assistant|>"
+        prompt_suffix = "<|end|>"
+
+        # Part 1: Image Processing
+        print("\n--- IMAGE PROCESSING ---")
+        image_url = "https://www.ilankelman.org/stopsigns/australia.jpg"
+        prompt = (
+            f"{user_prompt}<|image_1|>What is shown in this image"
+            f"?{prompt_suffix}{assistant_prompt}"
+        )
+        print(f">>> Prompt\n{prompt}")
+
+        # Download and open image
+        image = Image.open(requests.get(image_url, stream=True).raw)
+        inputs = processor(text=prompt, images=image, return_tensors="pt").to("cuda:0")
+
+        # Generate response
+        print("--------- IMAGE PROCESSING ----------")
+        print()
+        with steal_forward(model):
+            generate_ids = model.generate(
+                **inputs,
+                max_new_tokens=1000,
+                generation_config=generation_config,
+            )
+        generate_ids = generate_ids[:, inputs["input_ids"].shape[1] :]
+        response = processor.batch_decode(
+            generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
+        )[0]
+        print(f">>> Response\n{response}")
+
+        # Part 2: Audio Processing
+        print("\n--- AUDIO PROCESSING ---")
+        audio_url = (
+            "https://upload.wikimedia.org/wikipedia/commons/b/b0/"
+            "Barbara_Sahakian_BBC_Radio4_The_Life_Scientific_29_May_2012_b01j5j24.flac"
+        )
+        speech_prompt = (
+            "Transcribe the audio to text, and then translate the audio to French. "
+            "Use <sep> as a separator between the original transcript and the translation."
+        )
+        prompt = f"{user_prompt}<|audio_1|>{speech_prompt}{prompt_suffix}{assistant_prompt}"
+        print(f">>> Prompt\n{prompt}")
+
+        # Download and open audio file
+        audio, samplerate = sf.read(io.BytesIO(urlopen(audio_url).read()))
+
+        # Process with the model
+        inputs = processor(text=prompt, audios=[(audio, samplerate)], return_tensors="pt").to(
+            "cuda:0"
+        )
+
+        print("--------- AUDIO PROCESSING ----------")
+        print()
+        with steal_forward(model):
+            generate_ids = model.generate(
+                **inputs,
+                max_new_tokens=1000,
+                generation_config=generation_config,
+            )
+        generate_ids = generate_ids[:, inputs["input_ids"].shape[1] :]
+        response = processor.batch_decode(
+            generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
+        )[0]
+        print(f">>> Response\n{response}")
+
     @never_test()
     def test_imagetext2text_generation(self):
         # clear&&NEVERTEST=1 python _unittests/ut_tasks/try_tasks.py -k etext2t

onnx_diagnostic/tasks/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -4,6 +4,7 @@
     fill_mask,
     image_classification,
     image_text_to_text,
+    mixture_of_expert,
     sentence_similarity,
     text_classification,
     text_generation,
@@ -16,6 +17,7 @@
     fill_mask,
     image_classification,
     image_text_to_text,
+    mixture_of_expert,
     sentence_similarity,
     text_classification,
     text_generation,
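
A minimal sketch (not part of the commit) of what the added import exposes; it assumes only the mixture_of_expert import above and the __TASK__ constant defined in the new module shown below.

from onnx_diagnostic.tasks import mixture_of_expert

# Task name the new module declares; "MoE" is the value set in the file added below.
assert mixture_of_expert.__TASK__ == "MoE"
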
onnx_diagnostic/tasks/mixture_of_expert.py

Lines changed: 154 additions & 0 deletions
@@ -0,0 +1,154 @@
+from typing import Any, Callable, Dict, Optional, Tuple
+import torch
+from ..helpers.cache_helper import make_dynamic_cache
+from ..helpers.config_helper import update_config, check_hasattr, _pick
+
+__TASK__ = "MoE"
+
+
+def reduce_model_config(config: Any) -> Dict[str, Any]:
+    """Reduces a model size."""
+    kwargs: Dict[str, Any] = {}
+    if hasattr(config, "num_hidden_layers"):
+        config.num_hidden_layers = min(config.num_hidden_layers, 2)
+    if hasattr(config, "vision_config") and hasattr(config.vision_config, "num_hidden_layers"):
+        config.vision_config.num_hidden_layers = min(config.vision_config.num_hidden_layers, 2)
+    if hasattr(config, "audio_processor") and hasattr(
+        config.audio_processor, "num_hidden_layers"
+    ):
+        config.audio_processor.num_hidden_layers = min(
+            config.audio_processor.num_hidden_layers, 2
+        )
+    if hasattr(config, "audio_processor") and hasattr(config.audio_processor, "attention_dim"):
+        config.audio_processor.attention_dim = min(config.audio_processor.attention_dim, 2)
+    update_config(config, kwargs)
+    return kwargs
+
+
+def get_inputs(
+    model: torch.nn.Module,
+    config: Optional[Any],
+    dummy_max_token_id: int,
+    num_key_value_heads: int,
+    num_hidden_layers: int,
+    head_dim: int,
+    width: int,
+    height: int,
+    num_channels: int,
+    batch_size: int = 2,
+    sequence_length: int = 30,
+    sequence_length2: int = 3,
+    n_images: int = 2,
+    dynamic_rope: bool = False,
+    **kwargs,  # unused
+):
+    """
+    Generates inputs for task ``MoE``.
+
+    :param model: model to get the missing information
+    :param config: configuration used to generate the model
+    :param head_dim: last dimension of the cache
+    :param dummy_max_token_id: dummy max token id
+    :param batch_size: batch size
+    :param sequence_length: sequence length
+    :param sequence_length2: new sequence length
+    :param n_images: number of images
+    :param width: width of the image
+    :param height: height of the image
+    :param num_channels: number of channels
+    :param dynamic_rope: use dynamic rope (see :class:`transformers.LlamaConfig`)
+    :return: dictionary
+    """
+    batch = torch.export.Dim("batch", min=1, max=1024)
+    seq_length = "seq_length"  # torch.export.Dim("seq_length", min=1, max=4096)
+    cache_length = "cache_length"  # torch.export.Dim("cache_length", min=1, max=4096)
+    images = "images"  # torch.export.Dim("images", min=1, max=4096)
+
+    shapes = {
+        "input_ids": {0: batch, 1: seq_length},
+        "attention_mask": {
+            0: batch,
+            1: "cache+seq",  # cache_length + seq_length
+        },
+        "position_ids": {
+            0: batch,
+            1: "cache+seq",  # cache_length + seq_length
+        },
+        "past_key_values": [
+            [{0: batch, 2: cache_length} for _ in range(num_hidden_layers)],
+            [{0: batch, 2: cache_length} for _ in range(num_hidden_layers)],
+        ],
+        "pixel_values": {0: batch, 1: images},
+        "image_attention_mask": {0: batch, 1: seq_length, 2: images},
+    }
+    inputs = dict(
+        input_ids=torch.randint(0, dummy_max_token_id, (batch_size, sequence_length2)).to(
+            torch.int64
+        ),
+        attention_mask=torch.ones((batch_size, sequence_length + sequence_length2)).to(
+            torch.int64
+        ),
+        position_ids=torch.arange(sequence_length, sequence_length + sequence_length2)
+        .to(torch.int64)
+        .expand((batch_size, -1)),
+        past_key_values=make_dynamic_cache(
+            [
+                (
+                    torch.randn(batch_size, num_key_value_heads, sequence_length, head_dim),
+                    torch.randn(batch_size, num_key_value_heads, sequence_length, head_dim),
+                )
+                for i in range(num_hidden_layers)
+            ]
+        ),
+        image_attention_mask=torch.ones((batch_size, sequence_length2, n_images)).to(
+            torch.int64
+        ),
+        pixel_values=torch.ones((batch_size, n_images, num_channels, width, height)).to(
+            torch.int64
+        ),
+    )
+    return dict(inputs=inputs, dynamic_shapes=shapes)
+
+
+def random_input_kwargs(config: Any) -> Tuple[Dict[str, Any], Callable]:
+    """
+    Inputs kwargs.
+
+    If the configuration is None, the function selects typical dimensions.
+    """
+    if config is not None:
+        check_hasattr(
+            config,
+            "vocab_size",
+            "hidden_size",
+            "num_attention_heads",
+            ("num_key_value_heads", "num_attention_heads"),
+            "intermediate_size",
+            "hidden_size",
+            "vision_config",
+            "audio_processor",
+        )
+        check_hasattr(config.vision_config, "image_size", "num_channels")
+    kwargs = dict(
+        batch_size=2,
+        sequence_length=30,
+        sequence_length2=3,
+        head_dim=(
+            16
+            if config is None
+            else getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
+        ),
+        dummy_max_token_id=31999 if config is None else config.vocab_size - 1,
+        num_hidden_layers=4 if config is None else config.num_hidden_layers,
+        num_key_value_heads=(
+            8
+            if config is None
+            else _pick(config, "num_key_value_heads", "num_attention_heads")
+        ),
+        intermediate_size=1024 if config is None else config.intermediate_size,
+        hidden_size=512 if config is None else config.hidden_size,
+        width=224 if config is None else config.vision_config.image_size,
+        height=224 if config is None else config.vision_config.image_size,
+        num_channels=3 if config is None else config.vision_config.num_channels,
+    )
+    return kwargs, get_inputs
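
A rough usage sketch (not part of the commit) of how the two helpers above appear intended to be chained: random_input_kwargs picks typical dimensions when no configuration is given, and get_inputs turns them into dummy tensors plus dynamic-shape annotations. It assumes the new file is onnx_diagnostic/tasks/mixture_of_expert.py; passing model=None and config=None is also an assumption, based on neither argument being read in the function body.

from onnx_diagnostic.tasks.mixture_of_expert import get_inputs, random_input_kwargs

# config=None selects the typical dimensions hard-coded above (batch_size=2, head_dim=16, ...).
kwargs, input_fn = random_input_kwargs(None)
assert input_fn is get_inputs

# model and config are not used inside get_inputs, so None is passed here (assumption).
data = input_fn(model=None, config=None, **kwargs)
inputs, dynamic_shapes = data["inputs"], data["dynamic_shapes"]

# Expected from the code above: input_ids (2, 3), pixel_values (2, 2, 3, 224, 224).
print(inputs["input_ids"].shape)
print(inputs["pixel_values"].shape)
print(list(dynamic_shapes))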

onnx_diagnostic/torch_models/hghub/hub_data.py

Lines changed: 1 addition & 0 deletions
@@ -76,6 +76,7 @@
 MobileNetV2Model,image-feature-extraction
 MobileViTForImageClassification,image-classification
 ModernBertForMaskedLM,fill-mask
+Phi4MMForCausalLM,MoE
 MoonshineForConditionalGeneration,automatic-speech-recognition
 MptForCausalLM,text-generation
 MusicgenForConditionalGeneration,text-to-audio
