8
8
from datasets import IterableDataset as HfIterableDataset
9
9
from tqdm import tqdm
10
10
11
+ from swift .utils import get_hf_endpoint , use_hf_hub
11
12
from ..media import MediaResource
12
13
from ..preprocessor import GroundingMixin , MessagesPreprocessor , ResponsePreprocessor , RowPreprocessor
13
14
from ..register import DatasetMeta , SubsetDataset , register_dataset
@@ -27,8 +28,11 @@ def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
27
28
return row
28
29
29
30
def prepare_dataset (self , dataset ):
30
- url = ('https://www.modelscope.cn/api/v1/datasets/AI-ModelScope/ShareGPT-4o/repo?'
31
- 'Revision=master&FilePath=images.zip' )
31
+ if not use_hf_hub ():
32
+ url = ('https://www.modelscope.cn/api/v1/datasets/AI-ModelScope/ShareGPT-4o/repo?'
33
+ 'Revision=master&FilePath=images.zip' )
34
+ else :
35
+ url = f'{ get_hf_endpoint ()} /datasets/OpenGVLab/ShareGPT-4o/resolve/main/images.zip'
32
36
local_dir = MediaResource .download (url , 'sharegpt_4o_images' )
33
37
self .prefix_path = os .path .join (local_dir , 'mnt' , 'petrelfs' , 'wangwenhai' , 'workspace_cef' , '4o' , 'image' )
34
38
return super ().prepare_dataset (dataset )
@@ -186,8 +190,12 @@ def __init__(self, *, subset: str, columns: Optional[Dict[str, str]] = None) ->
186
190
super ().__init__ (columns = columns )
187
191
188
192
def prepare_dataset (self , dataset : HfDataset ) -> HfDataset :
189
- url = (f'https://www.modelscope.cn/api/v1/datasets/swift/Mantis-Instruct/repo?Revision='
190
- f'master&FilePath={ self .subset } /train_images.zip' ) # noqa
193
+ if not use_hf_hub ():
194
+ url = (f'https://www.modelscope.cn/api/v1/datasets/swift/Mantis-Instruct/repo?Revision='
195
+ f'master&FilePath={ self .subset } /train_images.zip' ) # noqa
196
+ else :
197
+ url = (f'{ get_hf_endpoint ()} /datasets/TIGER-Lab/Mantis-Instruct/'
198
+ f'resolve/main/{ self .subset } /train_images.zip' )
191
199
self .local_dir = MediaResource .download (url , f'mantis_{ self .subset } ' )
192
200
return super ().prepare_dataset (dataset )
193
201
@@ -324,7 +332,10 @@ class EmoSchemaPreprocessor(ResponsePreprocessor):
324
332
325
333
def prepare_dataset (self , dataset : HfDataset ) -> HfDataset :
326
334
for i in range (1 , 6 ):
327
- url = f'https://modelscope.cn/datasets/AI-ModelScope/egoschema/resolve/master/videos_chunked_0{ i } .zip'
335
+ if not use_hf_hub ():
336
+ url = f'https://modelscope.cn/datasets/AI-ModelScope/egoschema/resolve/master/videos_chunked_0{ i } .zip'
337
+ else :
338
+ url = f'{ get_hf_endpoint ()} /datasets/lmms-lab/egoschema/resolve/main/videos_chunked_0{ i } .zip'
328
339
local_dir = MediaResource .download (url , 'egoschema' )
329
340
330
341
self .local_dir = os .path .join (local_dir , 'videos' )
@@ -388,53 +399,49 @@ def __init__(self, *, subset: str, columns: Optional[Dict[str, str]] = None) ->
388
399
self .subset = subset
389
400
super ().__init__ (columns = columns )
390
401
402
+ url_prefix = 'https://www.modelscope.cn/datasets/lmms-lab/LLaVA-Video-178K/resolve/master/'
403
+ if use_hf_hub ():
404
+ url_prefix = f'{ get_hf_endpoint ()} /datasets/lmms-lab/LLaVA-Video-178K/resolve/main/'
405
+
391
406
video_resources = {
392
407
'0_30_s_academic_v0_1' :
393
408
_generate_url_list (
394
- 'https://www.modelscope.cn/datasets/lmms-lab/LLaVA-Video-178K/resolve/master/'
395
- '0_30_s_academic_v0_1/0_30_s_academic_v0_1_videos_{}.tar.gz' ,
409
+ url_prefix + '0_30_s_academic_v0_1/0_30_s_academic_v0_1_videos_{}.tar.gz' ,
396
410
8 ,
397
411
),
398
412
'0_30_s_youtube_v0_1' :
399
413
_generate_url_list (
400
- 'https://www.modelscope.cn/datasets/lmms-lab/LLaVA-Video-178K/resolve/master/'
401
- '0_30_s_youtube_v0_1/0_30_s_youtube_v0_1_videos_{}.tar.gz' ,
414
+ url_prefix + '0_30_s_youtube_v0_1/0_30_s_youtube_v0_1_videos_{}.tar.gz' ,
402
415
19 ,
403
416
),
404
417
'1_2_m_academic_v0_1' :
405
418
_generate_url_list (
406
- 'https://www.modelscope.cn/datasets/lmms-lab/LLaVA-Video-178K/resolve/master/'
407
- '1_2_m_academic_v0_1/1_2_m_academic_v0_1_videos_{}.tar.gz' ,
419
+ url_prefix + '1_2_m_academic_v0_1/1_2_m_academic_v0_1_videos_{}.tar.gz' ,
408
420
14 ,
409
421
),
410
422
'1_2_m_youtube_v0_1' :
411
423
_generate_url_list (
412
- 'https://www.modelscope.cn/datasets/lmms-lab/LLaVA-Video-178K/resolve/master/'
413
- '1_2_m_youtube_v0_1/1_2_m_youtube_v0_1_videos_{}.tar.gz' ,
424
+ url_prefix + '1_2_m_youtube_v0_1/1_2_m_youtube_v0_1_videos_{}.tar.gz' ,
414
425
50 ,
415
426
),
416
427
'2_3_m_academic_v0_1' :
417
428
_generate_url_list (
418
- 'https://www.modelscope.cn/datasets/lmms-lab/LLaVA-Video-178K/resolve/master/'
419
- '2_3_m_academic_v0_1/2_3_m_academic_v0_1_videos_{}.tar.gz' ,
429
+ url_prefix + '2_3_m_academic_v0_1/2_3_m_academic_v0_1_videos_{}.tar.gz' ,
420
430
18 ,
421
431
),
422
432
'2_3_m_youtube_v0_1' :
423
433
_generate_url_list (
424
- 'https://www.modelscope.cn/datasets/lmms-lab/LLaVA-Video-178K/resolve/master/'
425
- '2_3_m_youtube_v0_1/2_3_m_youtube_v0_1_videos_{}.tar.gz' ,
434
+ url_prefix + '2_3_m_youtube_v0_1/2_3_m_youtube_v0_1_videos_{}.tar.gz' ,
426
435
98 ,
427
436
),
428
437
'30_60_s_academic_v0_1' :
429
438
_generate_url_list (
430
- 'https://www.modelscope.cn/datasets/lmms-lab/LLaVA-Video-178K/resolve/master/'
431
- '30_60_s_academic_v0_1/30_60_s_academic_v0_1_videos_{}.tar.gz' ,
439
+ url_prefix + '30_60_s_academic_v0_1/30_60_s_academic_v0_1_videos_{}.tar.gz' ,
432
440
10 ,
433
441
),
434
442
'30_60_s_youtube_v0_1' :
435
443
_generate_url_list (
436
- 'https://www.modelscope.cn/datasets/lmms-lab/LLaVA-Video-178K/resolve/master/'
437
- '30_60_s_youtube_v0_1/30_60_s_youtube_v0_1_videos_{}.tar.gz' ,
444
+ url_prefix + '30_60_s_youtube_v0_1/30_60_s_youtube_v0_1_videos_{}.tar.gz' ,
438
445
13 ,
439
446
),
440
447
}
@@ -495,7 +502,10 @@ def prepare_dataset(self, dataset: HfDataset) -> HfDataset:
495
502
[f'TFS-{ i } .mp4' for i in range (1 , 13 )] + \
496
503
[f'UWA-{ i } .mp4' for i in range (1 , 5 )] + ['UWA-6.mp4' ]
497
504
for file in mp4_set :
498
- url = f'https://modelscope.cn/datasets/AI-ModelScope/MovieChat-1K-test/resolve/master/videos/{ file } '
505
+ if not use_hf_hub ():
506
+ url = f'https://modelscope.cn/datasets/AI-ModelScope/MovieChat-1K-test/resolve/master/videos/{ file } '
507
+ else :
508
+ url = f'{ get_hf_endpoint ()} /datasets/Enxin/MovieChat-1K-test/resolve/main/videos/{ file } '
499
509
self .local_dir = MediaResource .download (url , 'moviechat_1k_test' , file_type = 'file' )
500
510
return super ().prepare_dataset (dataset )
501
511
@@ -522,7 +532,10 @@ def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
522
532
class VideoChatGPTPreprocessor (ResponsePreprocessor ):
523
533
524
534
def prepare_dataset (self , dataset : HfDataset ) -> HfDataset :
525
- url = 'https://modelscope.cn/datasets/swift/VideoChatGPT/resolve/master/videos.zip'
535
+ if not use_hf_hub ():
536
+ url = 'https://modelscope.cn/datasets/swift/VideoChatGPT/resolve/master/videos.zip'
537
+ else :
538
+ url = f'{ get_hf_endpoint ()} /datasets/lmms-lab/VideoChatGPT/resolve/main/videos.zip'
526
539
local_dir = MediaResource .download (url , 'video_chatgpt' )
527
540
self .local_dir = os .path .join (local_dir , 'Test_Videos' )
528
541
return super ().prepare_dataset (dataset )
@@ -894,9 +907,13 @@ def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
894
907
class LLaVAPretrainPreprocessor (MessagesPreprocessor ):
895
908
896
909
def prepare_dataset (self , dataset ):
910
+ if not use_hf_hub ():
911
+ url = ('https://www.modelscope.cn/api/v1/datasets/AI-ModelScope/LLaVA-Pretrain/repo?'
912
+ 'Revision=master&FilePath=images.zip' )
913
+ else :
914
+ url = f'{ get_hf_endpoint ()} /datasets/liuhaotian/LLaVA-Pretrain/resolve/main/images.zip'
897
915
self .media_dir = MediaResource .download (
898
- ('https://www.modelscope.cn/api/v1/datasets/AI-ModelScope/LLaVA-Pretrain/repo?'
899
- 'Revision=master&FilePath=images.zip' ),
916
+ url ,
900
917
# noqa
901
918
'llava_pretrain' )
902
919
return super ().prepare_dataset (dataset )
0 commit comments