mindspore-lab
diff --git a/‎mindone/transformers/__init__.py‎
Lines changed: 9 additions & 0 deletions b/‎mindone/transformers/__init__.py‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎mindone/transformers/models/__init__.py‎
Lines changed: 2 additions & 0 deletions b/‎mindone/transformers/models/__init__.py‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎mindone/transformers/models/auto/__init__.py‎
Lines changed: 7 additions & 1 deletion b/‎mindone/transformers/models/auto/__init__.py‎
Lines changed: 7 additions & 1 deletion
diff --git a/‎mindone/transformers/models/auto/processing_auto.py‎
Lines changed: 1 addition & 0 deletions b/‎mindone/transformers/models/auto/processing_auto.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎mindone/transformers/models/idefics/__init__.py‎
Lines changed: 20 additions & 0 deletions b/‎mindone/transformers/models/idefics/__init__.py‎
Lines changed: 20 additions & 0 deletions
diff --git a/‎mindone/transformers/models/idefics/image_processing_idefics.py‎
Lines changed: 191 additions & 0 deletions b/‎mindone/transformers/models/idefics/image_processing_idefics.py‎
Lines changed: 191 additions & 0 deletions
@@ -56,6 +56,7 @@
     AutoModelForCausalLM,
     AutoModelForImageTextToText,
     AutoModelForMaskedLM,
+    AutoModelForVision2Seq,
     AutoProcessor,
 )
 from .models.bart import (
@@ -251,6 +252,14 @@
     HieraPreTrainedModel,
 )
 from .models.hubert import HubertForCTC, HubertForSequenceClassification, HubertModel, HubertPreTrainedModel
+from .models.idefics import (
+    IdeficsForVisionText2Text,
+    IdeficsImageProcessor,
+    IdeficsModel,
+    IdeficsPreTrainedModel,
+    IdeficsProcessor,
+)
+from .models.idefics2 import Idefics2ForConditionalGeneration, Idefics2Model, Idefics2PreTrainedModel
 from .models.idefics3 import (
     Idefics3ForConditionalGeneration,
     Idefics3Model,
 
@@ -47,6 +47,8 @@
     granitemoeshared,
     hiera,
     hubert,
+    idefics,
+    idefics2,
     idefics3,
     ijepa,
     imagegpt,
 
@@ -17,5 +17,11 @@
 from .configuration_auto import AutoConfig
 from .feature_extraction_auto import AutoFeatureExtractor
 from .image_processing_auto import AutoImageProcessor
-from .modeling_auto import AutoModel, AutoModelForCausalLM, AutoModelForImageTextToText, AutoModelForMaskedLM
+from .modeling_auto import (
+    AutoModel,
+    AutoModelForCausalLM,
+    AutoModelForImageTextToText,
+    AutoModelForMaskedLM,
+    AutoModelForVision2Seq,
+)
 from .processing_auto import AutoProcessor
@@ -51,6 +51,7 @@
     [
         ("blip", "BlipProcessor"),
         ("chameleon", "ChameleonProcessor"),
+        ("idefics", "IdeficsProcessor"),
         ("llava_next", "LlavaNextProcessor"),
         ("llava_next_video", "LlavaNextVideoProcessor"),
         ("llava_onevision", "LlavaOnevisionProcessor"),
 
@@ -0,0 +1,20 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# This code is adapted from https://github.com/huggingface/transformers
+# with modifications to run transformers on mindspore.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .image_processing_idefics import *
+from .modeling_idefics import IdeficsForVisionText2Text, IdeficsModel, IdeficsPreTrainedModel
+from .processing_idefics import *
@@ -0,0 +1,191 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# This code is adapted from https://github.com/huggingface/transformers
+# with modifications to run transformers on mindspore.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Image processor class for Idefics."""
+
+from typing import Callable, Optional, Union
+
+from PIL import Image
+
+from mindspore import mint
+
+from ...image_processing_utils import BaseImageProcessor, BatchFeature
+from ...image_transforms import resize, to_channel_dimension_format
+from ...image_utils import (
+    ChannelDimension,
+    ImageInput,
+    PILImageResampling,
+    make_list_of_images,
+    to_numpy_array,
+    valid_images,
+)
+from ...utils import TensorType
+
+IDEFICS_STANDARD_MEAN = [0.48145466, 0.4578275, 0.40821073]
+IDEFICS_STANDARD_STD = [0.26862954, 0.26130258, 0.27577711]
+
+
+def convert_to_rgb(image):
+    # `image.convert("RGB")` would only work for .jpg images, as it creates a wrong background
+    # for transparent images. The call to `alpha_composite` handles this case
+    if image.mode == "RGB":
+        return image
+
+    image_rgba = image.convert("RGBA")
+    background = Image.new("RGBA", image_rgba.size, (255, 255, 255))
+    alpha_composite = Image.alpha_composite(background, image_rgba)
+    alpha_composite = alpha_composite.convert("RGB")
+    return alpha_composite
+
+
+class IdeficsImageProcessor(BaseImageProcessor):
+    r"""
+    Constructs a Idefics image processor.
+
+    Args:
+        image_size (`int`, *optional*, defaults to 224):
+            Resize to image size
+        image_mean (`float` or `list[float]`, *optional*, defaults to `IDEFICS_STANDARD_MEAN`):
+            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
+            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. Can be
+            overridden by the `image_mean` parameter in the `preprocess` method.
+        image_std (`float` or `list[float]`, *optional*, defaults to `IDEFICS_STANDARD_STD`):
+            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
+            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
+            Can be overridden by the `image_std` parameter in the `preprocess` method.
+        image_num_channels (`int`, *optional*, defaults to 3):
+            Number of image channels.
+        do_rescale (`bool`, *optional*, defaults to `True`):
+            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in
+            the `preprocess` method.
+        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
+            Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess`
+            method.
+    """
+
+    model_input_names = ["pixel_values"]
+
+    def __init__(
+        self,
+        image_size: int = 224,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        image_num_channels: Optional[int] = 3,
+        do_rescale: bool = True,
+        rescale_factor: Union[int, float] = 1 / 255,
+        **kwargs,
+    ) -> None:
+        super().__init__(**kwargs)
+
+        self.image_size = image_size
+        self.image_num_channels = image_num_channels
+        self.image_mean = image_mean if image_mean is not None else IDEFICS_STANDARD_MEAN
+        self.image_std = image_std if image_std is not None else IDEFICS_STANDARD_STD
+        self.do_rescale = do_rescale
+        self.rescale_factor = rescale_factor
+
+    def preprocess(
+        self,
+        images: ImageInput,
+        image_num_channels: Optional[int] = 3,
+        image_size: Optional[dict[str, int]] = None,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        transform: Optional[Callable] = None,
+        do_rescale: Optional[bool] = None,
+        rescale_factor: Optional[float] = None,
+        return_tensors: Optional[Union[str, TensorType]] = TensorType.MINDSPORE,
+        **kwargs,
+    ) -> TensorType:
+        """
+        Preprocess a batch of images.
+
+        Args:
+            images (`ImageInput`):
+                A list of images to preprocess.
+            image_size (`int`, *optional*, defaults to `self.image_size`):
+                Resize to image size
+            image_num_channels (`int`, *optional*, defaults to `self.image_num_channels`):
+                Number of image channels.
+            image_mean (`float` or `list[float]`, *optional*, defaults to `IDEFICS_STANDARD_MEAN`):
+                Mean to use if normalizing the image. This is a float or list of floats the length of the number of
+                channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. Can
+                be overridden by the `image_mean` parameter in the `preprocess` method.
+            image_std (`float` or `list[float]`, *optional*, defaults to `IDEFICS_STANDARD_STD`):
+                Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
+                number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess`
+                method. Can be overridden by the `image_std` parameter in the `preprocess` method.
+            transform (`Callable`, *optional*, defaults to `None`):
+                A custom transform function that accepts a single image can be passed for training. For example,
+                `transforms.Compose` can be used to compose multiple transforms. If `None` - an inference mode is
+                assumed - and then a preset of inference-specific transforms will be applied to the images
+            do_rescale (`bool`, *optional*, defaults to `True`):
+                Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in
+                the `preprocess` method.
+            rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
+                Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess`
+                method.
+
+        Returns:
+            a MindSpore tensor of the processed images
+
+        """
+        image_size = image_size if image_size is not None else self.image_size
+        image_num_channels = image_num_channels if image_num_channels is not None else self.image_num_channels
+        image_mean = image_mean if image_mean is not None else self.image_mean
+        image_std = image_std if image_std is not None else self.image_std
+        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
+        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
+        size = (image_size, image_size)
+
+        if isinstance(images, list) and len(images) == 0:
+            return []
+
+        images = make_list_of_images(images)
+
+        if not valid_images(images):
+            raise ValueError(
+                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
+                "ms.Tensor, tf.Tensor or jax.ndarray."
+            )
+
+        # For training a user needs to pass their own set of transforms as a Callable.
+        # For reference this is what was used in the original IDEFICS training:
+        # transform = transforms.Compose([
+        #     convert_to_rgb,
+        #     vision.RandomResizedCrop((size, size), scale=(0.9, 1.0), interpolation=vision.Inter.BICUBIC),
+        #     vision.ToTensor(),
+        #     vision.Normalize(mean=image_mean, std=image_std),
+        # ])
+        if transform is not None:
+            images = [transform(x) for x in images]
+            return mint.stack(images)
+
+        # for inference we do the exact transforms that were used to train IDEFICS
+        images = [convert_to_rgb(x) for x in images]
+        # further transforms expect numpy arrays
+        images = [to_numpy_array(x) for x in images]
+        images = [resize(x, size, resample=PILImageResampling.BICUBIC) for x in images]
+        images = [self.rescale(image=image, scale=rescale_factor) for image in images]
+        images = [self.normalize(x, mean=image_mean, std=image_std) for x in images]
+        images = [to_channel_dimension_format(x, ChannelDimension.FIRST) for x in images]
+        images = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors)["pixel_values"]
+
+        return images
+
+
+__all__ = ["IdeficsImageProcessor"]
Original file line number	Diff line number	Diff line change
`@@ -51,6 +51,7 @@`
`51`	`51`	`[`
`52`	`52`	`("blip", "BlipProcessor"),`
`53`	`53`	`("chameleon", "ChameleonProcessor"),`
	`54`	`+ ("idefics", "IdeficsProcessor"),`
`54`	`55`	`("llava_next", "LlavaNextProcessor"),`
`55`	`56`	`("llava_next_video", "LlavaNextVideoProcessor"),`
`56`	`57`	`("llava_onevision", "LlavaOnevisionProcessor"),`