Skip to content

Commit 46ff896

Browse files
authored
feature (transformer): Swin (#1173)
* init * licenses * update test script * correct import * correct import * example * updates * add an example in docstring * fix ci error
1 parent edede6c commit 46ff896

File tree

7 files changed

+1809
-1
lines changed

7 files changed

+1809
-1
lines changed

mindone/transformers/__init__.py

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,14 +49,92 @@
4949
AriaTextPreTrainedModel,
5050
)
5151
from .models.auto import (
52+
MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING,
53+
MODEL_FOR_AUDIO_FRAME_CLASSIFICATION_MAPPING,
54+
MODEL_FOR_AUDIO_XVECTOR_MAPPING,
55+
MODEL_FOR_BACKBONE_MAPPING,
56+
MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING,
57+
MODEL_FOR_CAUSAL_LM_MAPPING,
58+
MODEL_FOR_CTC_MAPPING,
59+
MODEL_FOR_DEPTH_ESTIMATION_MAPPING,
60+
MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING,
61+
MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
62+
MODEL_FOR_IMAGE_MAPPING,
63+
MODEL_FOR_IMAGE_SEGMENTATION_MAPPING,
64+
MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING,
65+
MODEL_FOR_IMAGE_TO_IMAGE_MAPPING,
66+
MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING,
67+
MODEL_FOR_KEYPOINT_DETECTION_MAPPING,
68+
MODEL_FOR_MASK_GENERATION_MAPPING,
69+
MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING,
70+
MODEL_FOR_MASKED_LM_MAPPING,
71+
MODEL_FOR_MULTIPLE_CHOICE_MAPPING,
72+
MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING,
73+
MODEL_FOR_OBJECT_DETECTION_MAPPING,
74+
MODEL_FOR_PRETRAINING_MAPPING,
75+
MODEL_FOR_QUESTION_ANSWERING_MAPPING,
76+
MODEL_FOR_RETRIEVAL_MAPPING,
77+
MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING,
78+
MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
79+
MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
80+
MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING,
81+
MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING,
82+
MODEL_FOR_TEXT_ENCODING_MAPPING,
83+
MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING,
84+
MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING,
85+
MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING,
86+
MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING,
87+
MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
88+
MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING,
89+
MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING,
90+
MODEL_FOR_VISION_2_SEQ_MAPPING,
91+
MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING,
92+
MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING,
93+
MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING,
94+
MODEL_MAPPING,
95+
MODEL_WITH_LM_HEAD_MAPPING,
96+
AutoBackbone,
5297
AutoConfig,
5398
AutoFeatureExtractor,
5499
AutoImageProcessor,
55100
AutoModel,
101+
AutoModelForAudioClassification,
102+
AutoModelForAudioFrameClassification,
103+
AutoModelForAudioXVector,
56104
AutoModelForCausalLM,
105+
AutoModelForCTC,
106+
AutoModelForDepthEstimation,
107+
AutoModelForDocumentQuestionAnswering,
108+
AutoModelForImageClassification,
109+
AutoModelForImageSegmentation,
57110
AutoModelForImageTextToText,
111+
AutoModelForImageToImage,
112+
AutoModelForInstanceSegmentation,
113+
AutoModelForKeypointDetection,
114+
AutoModelForMaskedImageModeling,
58115
AutoModelForMaskedLM,
116+
AutoModelForMaskGeneration,
117+
AutoModelForMultipleChoice,
118+
AutoModelForNextSentencePrediction,
119+
AutoModelForObjectDetection,
120+
AutoModelForPreTraining,
121+
AutoModelForQuestionAnswering,
122+
AutoModelForSemanticSegmentation,
123+
AutoModelForSeq2SeqLM,
124+
AutoModelForSequenceClassification,
125+
AutoModelForSpeechSeq2Seq,
126+
AutoModelForTableQuestionAnswering,
127+
AutoModelForTextEncoding,
128+
AutoModelForTextToSpectrogram,
129+
AutoModelForTextToWaveform,
130+
AutoModelForTokenClassification,
131+
AutoModelForUniversalSegmentation,
132+
AutoModelForVideoClassification,
59133
AutoModelForVision2Seq,
134+
AutoModelForVisualQuestionAnswering,
135+
AutoModelForZeroShotImageClassification,
136+
AutoModelForZeroShotObjectDetection,
137+
AutoModelWithLMHead,
60138
AutoProcessor,
61139
)
62140
from .models.bart import (
@@ -470,6 +548,13 @@
470548
Starcoder2Model,
471549
Starcoder2PreTrainedModel,
472550
)
551+
from .models.swin import (
552+
SwinBackbone,
553+
SwinForImageClassification,
554+
SwinForMaskedImageModeling,
555+
SwinModel,
556+
SwinPreTrainedModel,
557+
)
473558
from .models.switch_transformers import (
474559
SwitchTransformersEncoderModel,
475560
SwitchTransformersForConditionalGeneration,

mindone/transformers/models/auto/__init__.py

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,88 @@
1818
from .feature_extraction_auto import AutoFeatureExtractor
1919
from .image_processing_auto import AutoImageProcessor
2020
from .modeling_auto import (
21+
MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING,
22+
MODEL_FOR_AUDIO_FRAME_CLASSIFICATION_MAPPING,
23+
MODEL_FOR_AUDIO_XVECTOR_MAPPING,
24+
MODEL_FOR_BACKBONE_MAPPING,
25+
MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING,
26+
MODEL_FOR_CAUSAL_LM_MAPPING,
27+
MODEL_FOR_CTC_MAPPING,
28+
MODEL_FOR_DEPTH_ESTIMATION_MAPPING,
29+
MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING,
30+
MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
31+
MODEL_FOR_IMAGE_MAPPING,
32+
MODEL_FOR_IMAGE_SEGMENTATION_MAPPING,
33+
MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING,
34+
MODEL_FOR_IMAGE_TO_IMAGE_MAPPING,
35+
MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING,
36+
MODEL_FOR_KEYPOINT_DETECTION_MAPPING,
37+
MODEL_FOR_MASK_GENERATION_MAPPING,
38+
MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING,
39+
MODEL_FOR_MASKED_LM_MAPPING,
40+
MODEL_FOR_MULTIPLE_CHOICE_MAPPING,
41+
MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING,
42+
MODEL_FOR_OBJECT_DETECTION_MAPPING,
43+
MODEL_FOR_PRETRAINING_MAPPING,
44+
MODEL_FOR_QUESTION_ANSWERING_MAPPING,
45+
MODEL_FOR_RETRIEVAL_MAPPING,
46+
MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING,
47+
MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
48+
MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
49+
MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING,
50+
MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING,
51+
MODEL_FOR_TEXT_ENCODING_MAPPING,
52+
MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING,
53+
MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING,
54+
MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING,
55+
MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING,
56+
MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
57+
MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING,
58+
MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING,
59+
MODEL_FOR_VISION_2_SEQ_MAPPING,
60+
MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING,
61+
MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING,
62+
MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING,
63+
MODEL_MAPPING,
64+
MODEL_WITH_LM_HEAD_MAPPING,
65+
AutoBackbone,
2166
AutoModel,
67+
AutoModelForAudioClassification,
68+
AutoModelForAudioFrameClassification,
69+
AutoModelForAudioXVector,
2270
AutoModelForCausalLM,
71+
AutoModelForCTC,
72+
AutoModelForDepthEstimation,
73+
AutoModelForDocumentQuestionAnswering,
74+
AutoModelForImageClassification,
75+
AutoModelForImageSegmentation,
2376
AutoModelForImageTextToText,
77+
AutoModelForImageToImage,
78+
AutoModelForInstanceSegmentation,
79+
AutoModelForKeypointDetection,
80+
AutoModelForMaskedImageModeling,
2481
AutoModelForMaskedLM,
82+
AutoModelForMaskGeneration,
83+
AutoModelForMultipleChoice,
84+
AutoModelForNextSentencePrediction,
85+
AutoModelForObjectDetection,
86+
AutoModelForPreTraining,
87+
AutoModelForQuestionAnswering,
88+
AutoModelForSemanticSegmentation,
89+
AutoModelForSeq2SeqLM,
90+
AutoModelForSequenceClassification,
91+
AutoModelForSpeechSeq2Seq,
92+
AutoModelForTableQuestionAnswering,
93+
AutoModelForTextEncoding,
94+
AutoModelForTextToSpectrogram,
95+
AutoModelForTextToWaveform,
96+
AutoModelForTokenClassification,
97+
AutoModelForUniversalSegmentation,
98+
AutoModelForVideoClassification,
2599
AutoModelForVision2Seq,
100+
AutoModelForVisualQuestionAnswering,
101+
AutoModelForZeroShotImageClassification,
102+
AutoModelForZeroShotObjectDetection,
103+
AutoModelWithLMHead,
26104
)
27105
from .processing_auto import AutoProcessor

mindone/transformers/models/auto/configuration_auto.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,7 @@
100100
("roberta", "RobertaConfig"),
101101
("recurrent_gemma", "RecurrentGemmaConfig"),
102102
("rembert", "RemBertConfig"),
103+
("swin", "SwinConfig"),
103104
("siglip", "SiglipConfig"),
104105
("siglip_vision_model", "SiglipVisionConfig"),
105106
("smolvlm", "SmolVLMConfig"),
@@ -193,6 +194,7 @@
193194
("qwen2_vl", "Qwen2VL"),
194195
("recurrent_gemma", "RecurrentGemma"),
195196
("rembert", "RemBERT"),
197+
("swin", "Swin Transformer"),
196198
("siglip", "SigLIP"),
197199
("siglip_vision_model", "SiglipVisionModel"),
198200
("smolvlm", "SmolVLM"),

mindone/transformers/models/auto/modeling_auto.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -284,7 +284,11 @@
284284
("qwen2_vl", "Qwen2VLForConditionalGeneration"),
285285
]
286286
)
287-
287+
MODEL_FOR_RETRIEVAL_MAPPING_NAMES = OrderedDict(
288+
[
289+
# ("colpali", "ColPaliForRetrieval"),
290+
]
291+
)
288292
MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = OrderedDict(
289293
[
290294
("aria", "AriaForConditionalGeneration"),
@@ -563,6 +567,7 @@
563567
MODEL_FOR_BACKBONE_MAPPING_NAMES = OrderedDict(
564568
[
565569
("hiera", "HieraBackbone"),
570+
("swin", "SwinBackbone"),
566571
]
567572
)
568573

@@ -649,6 +654,7 @@
649654
MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING = _LazyAutoMapping(
650655
CONFIG_MAPPING_NAMES, MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES
651656
)
657+
MODEL_FOR_RETRIEVAL_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_RETRIEVAL_MAPPING_NAMES)
652658
MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING = _LazyAutoMapping(
653659
CONFIG_MAPPING_NAMES, MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING_NAMES
654660
)

mindone/transformers/models/swin/__init__.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
from .modeling_swin import (
2+
SwinBackbone,
3+
SwinForImageClassification,
4+
SwinForMaskedImageModeling,
5+
SwinModel,
6+
SwinPreTrainedModel,
7+
)

0 commit comments

Comments
 (0)