
Commit a1f9ff6

feat(transformers/pipelines): add object detection, zero shot classification, and zero shot object detection (#1193)
* feat(transformers/pipelines): add object detection, zero shot classification, and zero shot object detection
* fix bugs
* update to transformers v4.53
* add YOLOS and OWL-ViT models
* fixes
* fix iterator
1 parent 91a28f6 commit a1f9ff6
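For context, a minimal usage sketch of the new zero-shot object detection pipeline, assuming the mindone.transformers pipeline API mirrors its Hugging Face transformers counterpart; the task string, checkpoint, image URL, and result keys below are illustrative assumptions rather than values taken from this commit.

# Sketch only: assumes the mindone.transformers pipeline mirrors the HF
# transformers API; checkpoint, task name, and result keys are assumptions.
from mindone.transformers import pipeline

detector = pipeline(
    task="zero-shot-object-detection",
    model="google/owlvit-base-patch32",  # assumed OWL-ViT checkpoint
)
results = detector(
    "http://images.cocodataset.org/val2017/000000039769.jpg",
    candidate_labels=["cat", "remote control"],
)
for result in results:
    # Each detection is expected to carry a label, a confidence score, and a box.
    print(result["label"], round(result["score"], 3), result["box"])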

23 files changed, +6619 -64 lines changed

mindone/transformers/__init__.py

Lines changed: 10 additions & 0 deletions
@@ -476,6 +476,15 @@
     OPTModel,
     OPTPreTrainedModel,
 )
+from .models.owlvit import (
+    OwlViTForObjectDetection,
+    OwlViTImageProcessor,
+    OwlViTModel,
+    OwlViTPreTrainedModel,
+    OwlViTProcessor,
+    OwlViTTextModel,
+    OwlViTVisionModel,
+)
 from .models.paligemma import PaliGemmaForConditionalGeneration, PaliGemmaPreTrainedModel
 from .models.persimmon import (
     PersimmonForCausalLM,
@@ -638,6 +647,7 @@
     XLMRobertaXLModel,
     XLMRobertaXLPreTrainedModel,
 )
+from .models.yolos import YolosForObjectDetection, YolosImageProcessor, YolosModel, YolosPreTrainedModel
 from .pipelines import TextGenerationPipeline, pipeline
 from .processing_utils import ProcessorMixin
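These new exports also allow the YOLOS and OWL-ViT classes to be used directly. A rough sketch for YOLOS object detection, assuming the mindone classes keep the Hugging Face transformers interfaces (checkpoint name, tensor handling, and the post-processing call are assumptions):

# Sketch only: assumes the mindone YOLOS classes keep the HF transformers
# interfaces; checkpoint, return_tensors choice, and threshold are assumptions.
import mindspore as ms
import requests
from PIL import Image

from mindone.transformers import YolosForObjectDetection, YolosImageProcessor

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

processor = YolosImageProcessor.from_pretrained("hustvl/yolos-tiny")
model = YolosForObjectDetection.from_pretrained("hustvl/yolos-tiny")

inputs = processor(images=image, return_tensors="np")
outputs = model(pixel_values=ms.Tensor(inputs["pixel_values"]))

# Convert raw logits/boxes into labeled detections above a confidence threshold.
detections = processor.post_process_object_detection(
    outputs, threshold=0.5, target_sizes=[image.size[::-1]]
)[0]
for label, score, box in zip(detections["labels"], detections["scores"], detections["boxes"]):
    print(model.config.id2label[int(label)], float(score), box)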

mindone/transformers/models/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -71,6 +71,7 @@
     mpt,
     mvp,
     opt,
+    owlvit,
     paligemma,
     persimmon,
     phi,
@@ -93,6 +94,7 @@
     vits,
     wav2vec2,
     xlm_roberta,
+    yolos,
 )
 
 if version.parse(transformers.__version__) >= version.parse("4.51.0"):

mindone/transformers/models/auto/configuration_auto.py

Lines changed: 4 additions & 0 deletions
@@ -93,6 +93,7 @@
         ("mt5", "MT5Config"),
         ("megatron-bert", "MegatronBertConfig"),
         ("mixtral", "MixtralConfig"),
+        ("owlvit", "OwlViTConfig"),
         ("paligemma", "PaliGemmaConfig"),
         ("phi", "PhiConfig"),
         ("phi3", "Phi3Config"),
@@ -120,6 +121,7 @@
         ("whisper", "WhisperConfig"),
         ("xlm-roberta", "XLMRobertaConfig"),
         ("xlm-roberta-xl", "XLMRobertaXLConfig"),
+        ("yolos", "YolosConfig"),
         ("cohere2", "Cohere2Config"),
     ]
 )
@@ -195,6 +197,7 @@
         ("megatron-bert", "Megatron-BERT"),
         ("mistral", "Mistral"),
         ("mixtral", "Mixtral"),
+        ("owlvit", "OWL-ViT"),
         ("paligemma", "PaliGemma"),
         ("phi", "Phi"),
         ("phi3", "Phi3"),
@@ -222,6 +225,7 @@
         ("opt", "OPT"),
         ("xlm-roberta", "XLM-RoBERTa"),
         ("xlm-roberta-xl", "XLM-RoBERTa-XL"),
+        ("yolos", "YOLOS"),
         ("cohere2", "Cohere2"),
     ]
 )

mindone/transformers/models/auto/image_processing_auto.py

Lines changed: 2 additions & 0 deletions
@@ -60,8 +60,10 @@
         ("llava_next", ("LlavaNextImageProcessor",)),
         ("llava_next_video", ("LlavaNextVideoImageProcessor",)),
         ("llava_onevision", ("LlavaOnevisionImageProcessor",)),
+        ("owlvit", ("OwlViTImageProcessor",)),
         ("segformer", ("SegformerImageProcessor",)),
         ("siglip", ("SiglipImageProcessor", "SiglipImageProcessorFast")),
+        ("yolos", ("YolosImageProcessor",)),
     ]
 )

mindone/transformers/models/auto/modeling_auto.py

Lines changed: 3 additions & 0 deletions
@@ -92,6 +92,7 @@
         ("mt5", "MT5Model"),
         ("megatron-bert", "MegatronBertModel"),
         ("mixtral", "MixtralModel"),
+        ("owlvit", "OwlViTModel"),
         ("phi", "PhiModel"),
         ("phi3", "Phi3Model"),
         ("qwen2", "Qwen2Model"),
@@ -114,6 +115,7 @@
         ("whisper", "WhisperModel"),
         ("xlm-roberta", "XLMRobertaModel"),
         ("xlm-roberta-xl", "XLMRobertaXLModel"),
+        ("yolos", "YolosModel"),
         ("cohere2", "Cohere2Model"),
     ]
 )
@@ -235,6 +237,7 @@
         ("segformer", "SegformerModel"),
         ("siglip_vision_model", "SiglipVisionModel"),
         ("vit", "ViTModel"),
+        ("yolos", "YolosModel"),
     ]
 )

mindone/transformers/models/auto/processing_auto.py

Lines changed: 1 addition & 0 deletions
@@ -55,6 +55,7 @@
         ("llava_next", "LlavaNextProcessor"),
         ("llava_next_video", "LlavaNextVideoProcessor"),
         ("llava_onevision", "LlavaOnevisionProcessor"),
+        ("owlvit", "OwlViTProcessor"),
         ("siglip", "SiglipProcessor"),
     ]
 )
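Together with the config and modeling mappings above, these registrations let OWL-ViT and YOLOS checkpoints resolve through the auto classes. A sketch, assuming mindone.transformers exposes the standard Auto entry points and that the checkpoint names below are suitable (both are assumptions):

# Sketch only: assumes AutoImageProcessor, AutoProcessor, and AutoModel are
# exported by mindone.transformers as in HF transformers; checkpoints are
# illustrative.
from mindone.transformers import AutoImageProcessor, AutoModel, AutoProcessor

# "yolos" now resolves to YolosConfig / YolosImageProcessor / YolosModel.
yolos_image_processor = AutoImageProcessor.from_pretrained("hustvl/yolos-tiny")
yolos_backbone = AutoModel.from_pretrained("hustvl/yolos-tiny")

# "owlvit" now resolves to OwlViTConfig / OwlViTProcessor / OwlViTModel.
owlvit_processor = AutoProcessor.from_pretrained("google/owlvit-base-patch32")
owlvit_model = AutoModel.from_pretrained("google/owlvit-base-patch32")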
mindone/transformers/models/owlvit/__init__.py

Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .image_processing_owlvit import *
+from .modeling_owlvit import *
+from .processing_owlvit import *
