[New Features] Add from_pretrained support to feature extractors and processors (#3453)

JunnYu · web-flow · commit a6b46912d26f · 2022-10-13T19:33:31.000+08:00
* update

* add imports for ProcessorMixin, BatchFeature and FeatureExtractionMixin
diff --git a/paddlenlp/transformers/__init__.py b/paddlenlp/transformers/__init__.py
@@ -14,6 +14,8 @@
 
 from .model_utils import PretrainedModel, register_base_model
 from .tokenizer_utils import PretrainedTokenizer, BPETokenizer, tokenize_chinese_chars, is_chinese_char, AddedToken, normalize_chars, tokenize_special_chars, convert_to_unicode
+from .processing_utils import ProcessorMixin
+from .feature_extraction_utils import BatchFeature, FeatureExtractionMixin
 from .attention_utils import create_bigbird_rand_mask_idx_list
 from .export import export_model
 
@@ -104,7 +106,7 @@
 from .unified_transformer.modeling import *
 from .unified_transformer.tokenizer import *
 from .ernie_vil.modeling import *
-from .ernie_vil.procesing import *
+from .ernie_vil.feature_extraction import *
 from .ernie_vil.tokenizer import *
 from .ernie_vil.procesing import *
 from .unimo.modeling import *
diff --git a/paddlenlp/transformers/clip/feature_extraction.py b/paddlenlp/transformers/clip/feature_extraction.py
@@ -20,7 +20,9 @@
 import numpy as np
 import PIL.Image
 from PIL import Image
-from ..feature_extraction_utils import BatchFeature
+
+from ..feature_extraction_utils import BatchFeature, FeatureExtractionMixin
+
 from ..tokenizer_utils_base import TensorType
 from ..image_utils import ImageFeatureExtractionMixin
 
@@ -34,7 +36,10 @@
 __all__ = ["CLIPFeatureExtractor"]
 
 
-class CLIPFeatureExtractor(ImageFeatureExtractionMixin):
+class CLIPFeatureExtractor(
+        FeatureExtractionMixin,
+        ImageFeatureExtractionMixin,
+):
     r"""
     Constructs a CLIP feature extractor.
     This feature extractor inherits from [`ImageFeatureExtractionMixin`] which contains most of the main methods. Users
diff --git a/paddlenlp/transformers/clip/modeling.py b/paddlenlp/transformers/clip/modeling.py
@@ -1119,6 +1119,12 @@ def __init__(self,
                                           normalize_before=True)
         self.apply(self._init_weights)
 
+    def get_input_embeddings(self) -> nn.Layer:
+        return self.text_model.token_embedding
+
+    def set_input_embeddings(self, value):
+        self.text_model.token_embedding = value
+
     def forward(
         self,
         input_ids=None,
@@ -1262,6 +1268,9 @@ def __init__(self,
 
         self.apply(self._init_weights)
 
+    def get_input_embeddings(self) -> nn.Layer:
+        return self.vision_model.conv1
+
     def forward(
         self,
         pixel_values=None,
diff --git a/paddlenlp/transformers/clip/procesing.py b/paddlenlp/transformers/clip/procesing.py
@@ -17,13 +17,12 @@
 """
 
 from ..tokenizer_utils_base import BatchEncoding
-from .tokenizer import CLIPTokenizer
-from .feature_extraction import CLIPFeatureExtractor
+from ..processing_utils import ProcessorMixin
 
 __all__ = ["CLIPProcessor"]
 
 
-class CLIPProcessor(object):
+class CLIPProcessor(ProcessorMixin):
     r"""
     Constructs a CLIP processor which wraps a CLIP feature extractor and a CLIP tokenizer into a single processor.
     [`CLIPProcessor`] offers all the functionalities of [`CLIPFeatureExtractor`] and [`CLIPTokenizer`]. See the
@@ -34,11 +33,12 @@ class CLIPProcessor(object):
         tokenizer ([`CLIPTokenizer`]):
             The tokenizer is a required input.
     """
+    feature_extractor_class = "CLIPFeatureExtractor"
+    tokenizer_class = "CLIPTokenizer"
 
     def __init__(self, feature_extractor, tokenizer):
-        super().__init__()
-        self.tokenizer = tokenizer
-        self.feature_extractor = feature_extractor
+        super().__init__(feature_extractor, tokenizer)
+        self.current_processor = self.feature_extractor
 
     def __call__(self, text=None, images=None, return_tensors=None, **kwargs):
         """
@@ -105,15 +105,3 @@ def decode(self, *args, **kwargs):
         the docstring of this method for more information.
         """
         return self.tokenizer.decode(*args, **kwargs)
-
-    # TODO junnyu find a better way from_pretrained and save_pretrained
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
-        tokenizer = CLIPTokenizer.from_pretrained(pretrained_model_name_or_path,
-                                                  *args, **kwargs)
-        feature_extractor = CLIPFeatureExtractor()
-        return cls(feature_extractor, tokenizer)
-
-    def save_pretrained(self, save_directory, filename_prefix=None, **kwargs):
-        return self.tokenizer.save_pretrained(save_directory, filename_prefix,
-                                              **kwargs)
diff --git a/paddlenlp/transformers/ernie_vil/feature_extraction.py b/paddlenlp/transformers/ernie_vil/feature_extraction.py
@@ -22,7 +22,7 @@
 import PIL.Image
 from PIL import Image
 
-from ..feature_extraction_utils import BatchFeature
+from ..feature_extraction_utils import BatchFeature, FeatureExtractionMixin
 from ..tokenizer_utils_base import TensorType
 from ..image_utils import ImageFeatureExtractionMixin
 
@@ -36,7 +36,8 @@
 __all__ = ["ErnieViLFeatureExtractor"]
 
 
-class ErnieViLFeatureExtractor(ImageFeatureExtractionMixin):
+class ErnieViLFeatureExtractor(FeatureExtractionMixin,
+                               ImageFeatureExtractionMixin):
     r"""
     Constructs a ErnieViL feature extractor.
     This feature extractor inherits from [`ImageFeatureExtractionMixin`] which contains most of the main methods. Users
@@ -66,6 +67,12 @@ class ErnieViLFeatureExtractor(ImageFeatureExtractionMixin):
     """
 
     model_input_names = ["pixel_values"]
+    pretrained_feature_extractor_file = {
+        "ernie_vil-2.0-base-zh":
+        "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_vil/ernie_vil-2.0-base-zh/preprocessor_config.json",
+        "disco_diffusion_ernie_vil-2.0-base-zh":
+        "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_vil/disco_diffusion_ernie_vil-2.0-base-zh/preprocessor_config.json",
+    }
 
     def __init__(self,
                  do_resize=True,
diff --git a/paddlenlp/transformers/ernie_vil/procesing.py b/paddlenlp/transformers/ernie_vil/procesing.py
@@ -17,13 +17,12 @@
 """
 
 from ..tokenizer_utils_base import BatchEncoding
-from .tokenizer import ErnieViLTokenizer
-from .feature_extraction import ErnieViLFeatureExtractor
+from ..processing_utils import ProcessorMixin
 
 __all__ = ["ErnieViLProcessor"]
 
 
-class ErnieViLProcessor(object):
+class ErnieViLProcessor(ProcessorMixin):
     r"""
     Constructs a ErnieViL processor which wraps a ErnieViL feature extractor and a ErnieViL tokenizer into a single processor.
     [`ErnieViLProcessor`] offers all the functionalities of [`ErnieViLFeatureExtractor`] and [`ErnieViLTokenizer`]. See the
@@ -34,11 +33,12 @@ class ErnieViLProcessor(object):
         tokenizer ([`ErnieViLTokenizer`]):
             The tokenizer is a required input.
     """
+    feature_extractor_class = "ErnieViLFeatureExtractor"
+    tokenizer_class = "ErnieViLTokenizer"
 
     def __init__(self, feature_extractor, tokenizer):
-        super().__init__()
-        self.tokenizer = tokenizer
-        self.feature_extractor = feature_extractor
+        super().__init__(feature_extractor, tokenizer)
+        self.current_processor = self.feature_extractor
 
     def __call__(self, text=None, images=None, return_tensors=None, **kwargs):
         """
@@ -105,15 +105,3 @@ def decode(self, *args, **kwargs):
         the docstring of this method for more information.
         """
         return self.tokenizer.decode(*args, **kwargs)
-
-    # TODO junnyu find a better way from_pretrained and save_pretrained
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
-        tokenizer = ErnieViLTokenizer.from_pretrained(
-            pretrained_model_name_or_path, *args, **kwargs)
-        feature_extractor = ErnieViLFeatureExtractor()
-        return cls(feature_extractor, tokenizer)
-
-    def save_pretrained(self, save_directory, filename_prefix=None, **kwargs):
-        return self.tokenizer.save_pretrained(save_directory, filename_prefix,
-                                              **kwargs)
diff --git a/paddlenlp/transformers/feature_extraction_utils.py b/paddlenlp/transformers/feature_extraction_utils.py
diff --git a/paddlenlp/transformers/processing_utils.py b/paddlenlp/transformers/processing_utils.py