Skip to content

Commit ec5c7f6

Browse files
Fix media downloading from hf (#4788)
1 parent eb18c64 commit ec5c7f6

File tree

3 files changed

+52
-28
lines changed

3 files changed

+52
-28
lines changed

swift/llm/dataset/dataset/mllm.py

Lines changed: 42 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from datasets import IterableDataset as HfIterableDataset
99
from tqdm import tqdm
1010

11+
from swift.utils import get_hf_endpoint, use_hf_hub
1112
from ..media import MediaResource
1213
from ..preprocessor import GroundingMixin, MessagesPreprocessor, ResponsePreprocessor, RowPreprocessor
1314
from ..register import DatasetMeta, SubsetDataset, register_dataset
@@ -27,8 +28,11 @@ def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
2728
return row
2829

2930
def prepare_dataset(self, dataset):
30-
url = ('https://www.modelscope.cn/api/v1/datasets/AI-ModelScope/ShareGPT-4o/repo?'
31-
'Revision=master&FilePath=images.zip')
31+
if not use_hf_hub():
32+
url = ('https://www.modelscope.cn/api/v1/datasets/AI-ModelScope/ShareGPT-4o/repo?'
33+
'Revision=master&FilePath=images.zip')
34+
else:
35+
url = f'{get_hf_endpoint()}/datasets/OpenGVLab/ShareGPT-4o/resolve/main/images.zip'
3236
local_dir = MediaResource.download(url, 'sharegpt_4o_images')
3337
self.prefix_path = os.path.join(local_dir, 'mnt', 'petrelfs', 'wangwenhai', 'workspace_cef', '4o', 'image')
3438
return super().prepare_dataset(dataset)
@@ -186,8 +190,12 @@ def __init__(self, *, subset: str, columns: Optional[Dict[str, str]] = None) ->
186190
super().__init__(columns=columns)
187191

188192
def prepare_dataset(self, dataset: HfDataset) -> HfDataset:
189-
url = (f'https://www.modelscope.cn/api/v1/datasets/swift/Mantis-Instruct/repo?Revision='
190-
f'master&FilePath={self.subset}/train_images.zip') # noqa
193+
if not use_hf_hub():
194+
url = (f'https://www.modelscope.cn/api/v1/datasets/swift/Mantis-Instruct/repo?Revision='
195+
f'master&FilePath={self.subset}/train_images.zip') # noqa
196+
else:
197+
url = (f'{get_hf_endpoint()}/datasets/TIGER-Lab/Mantis-Instruct/'
198+
f'resolve/main/{self.subset}/train_images.zip')
191199
self.local_dir = MediaResource.download(url, f'mantis_{self.subset}')
192200
return super().prepare_dataset(dataset)
193201

@@ -324,7 +332,10 @@ class EmoSchemaPreprocessor(ResponsePreprocessor):
324332

325333
def prepare_dataset(self, dataset: HfDataset) -> HfDataset:
326334
for i in range(1, 6):
327-
url = f'https://modelscope.cn/datasets/AI-ModelScope/egoschema/resolve/master/videos_chunked_0{i}.zip'
335+
if not use_hf_hub():
336+
url = f'https://modelscope.cn/datasets/AI-ModelScope/egoschema/resolve/master/videos_chunked_0{i}.zip'
337+
else:
338+
url = f'{get_hf_endpoint()}/datasets/lmms-lab/egoschema/resolve/main/videos_chunked_0{i}.zip'
328339
local_dir = MediaResource.download(url, 'egoschema')
329340

330341
self.local_dir = os.path.join(local_dir, 'videos')
@@ -388,53 +399,49 @@ def __init__(self, *, subset: str, columns: Optional[Dict[str, str]] = None) ->
388399
self.subset = subset
389400
super().__init__(columns=columns)
390401

402+
url_prefix = 'https://www.modelscope.cn/datasets/lmms-lab/LLaVA-Video-178K/resolve/master/'
403+
if use_hf_hub():
404+
url_prefix = f'{get_hf_endpoint()}/datasets/lmms-lab/LLaVA-Video-178K/resolve/main/'
405+
391406
video_resources = {
392407
'0_30_s_academic_v0_1':
393408
_generate_url_list(
394-
'https://www.modelscope.cn/datasets/lmms-lab/LLaVA-Video-178K/resolve/master/'
395-
'0_30_s_academic_v0_1/0_30_s_academic_v0_1_videos_{}.tar.gz',
409+
url_prefix + '0_30_s_academic_v0_1/0_30_s_academic_v0_1_videos_{}.tar.gz',
396410
8,
397411
),
398412
'0_30_s_youtube_v0_1':
399413
_generate_url_list(
400-
'https://www.modelscope.cn/datasets/lmms-lab/LLaVA-Video-178K/resolve/master/'
401-
'0_30_s_youtube_v0_1/0_30_s_youtube_v0_1_videos_{}.tar.gz',
414+
url_prefix + '0_30_s_youtube_v0_1/0_30_s_youtube_v0_1_videos_{}.tar.gz',
402415
19,
403416
),
404417
'1_2_m_academic_v0_1':
405418
_generate_url_list(
406-
'https://www.modelscope.cn/datasets/lmms-lab/LLaVA-Video-178K/resolve/master/'
407-
'1_2_m_academic_v0_1/1_2_m_academic_v0_1_videos_{}.tar.gz',
419+
url_prefix + '1_2_m_academic_v0_1/1_2_m_academic_v0_1_videos_{}.tar.gz',
408420
14,
409421
),
410422
'1_2_m_youtube_v0_1':
411423
_generate_url_list(
412-
'https://www.modelscope.cn/datasets/lmms-lab/LLaVA-Video-178K/resolve/master/'
413-
'1_2_m_youtube_v0_1/1_2_m_youtube_v0_1_videos_{}.tar.gz',
424+
url_prefix + '1_2_m_youtube_v0_1/1_2_m_youtube_v0_1_videos_{}.tar.gz',
414425
50,
415426
),
416427
'2_3_m_academic_v0_1':
417428
_generate_url_list(
418-
'https://www.modelscope.cn/datasets/lmms-lab/LLaVA-Video-178K/resolve/master/'
419-
'2_3_m_academic_v0_1/2_3_m_academic_v0_1_videos_{}.tar.gz',
429+
url_prefix + '2_3_m_academic_v0_1/2_3_m_academic_v0_1_videos_{}.tar.gz',
420430
18,
421431
),
422432
'2_3_m_youtube_v0_1':
423433
_generate_url_list(
424-
'https://www.modelscope.cn/datasets/lmms-lab/LLaVA-Video-178K/resolve/master/'
425-
'2_3_m_youtube_v0_1/2_3_m_youtube_v0_1_videos_{}.tar.gz',
434+
url_prefix + '2_3_m_youtube_v0_1/2_3_m_youtube_v0_1_videos_{}.tar.gz',
426435
98,
427436
),
428437
'30_60_s_academic_v0_1':
429438
_generate_url_list(
430-
'https://www.modelscope.cn/datasets/lmms-lab/LLaVA-Video-178K/resolve/master/'
431-
'30_60_s_academic_v0_1/30_60_s_academic_v0_1_videos_{}.tar.gz',
439+
url_prefix + '30_60_s_academic_v0_1/30_60_s_academic_v0_1_videos_{}.tar.gz',
432440
10,
433441
),
434442
'30_60_s_youtube_v0_1':
435443
_generate_url_list(
436-
'https://www.modelscope.cn/datasets/lmms-lab/LLaVA-Video-178K/resolve/master/'
437-
'30_60_s_youtube_v0_1/30_60_s_youtube_v0_1_videos_{}.tar.gz',
444+
url_prefix + '30_60_s_youtube_v0_1/30_60_s_youtube_v0_1_videos_{}.tar.gz',
438445
13,
439446
),
440447
}
@@ -495,7 +502,10 @@ def prepare_dataset(self, dataset: HfDataset) -> HfDataset:
495502
[f'TFS-{i}.mp4' for i in range(1, 13)] + \
496503
[f'UWA-{i}.mp4' for i in range(1, 5)] + ['UWA-6.mp4']
497504
for file in mp4_set:
498-
url = f'https://modelscope.cn/datasets/AI-ModelScope/MovieChat-1K-test/resolve/master/videos/{file}'
505+
if not use_hf_hub():
506+
url = f'https://modelscope.cn/datasets/AI-ModelScope/MovieChat-1K-test/resolve/master/videos/{file}'
507+
else:
508+
url = f'{get_hf_endpoint()}/datasets/Enxin/MovieChat-1K-test/resolve/main/videos/{file}'
499509
self.local_dir = MediaResource.download(url, 'moviechat_1k_test', file_type='file')
500510
return super().prepare_dataset(dataset)
501511

@@ -522,7 +532,10 @@ def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
522532
class VideoChatGPTPreprocessor(ResponsePreprocessor):
523533

524534
def prepare_dataset(self, dataset: HfDataset) -> HfDataset:
525-
url = 'https://modelscope.cn/datasets/swift/VideoChatGPT/resolve/master/videos.zip'
535+
if not use_hf_hub():
536+
url = 'https://modelscope.cn/datasets/swift/VideoChatGPT/resolve/master/videos.zip'
537+
else:
538+
url = f'{get_hf_endpoint()}/datasets/lmms-lab/VideoChatGPT/resolve/main/videos.zip'
526539
local_dir = MediaResource.download(url, 'video_chatgpt')
527540
self.local_dir = os.path.join(local_dir, 'Test_Videos')
528541
return super().prepare_dataset(dataset)
@@ -894,9 +907,13 @@ def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
894907
class LLaVAPretrainPreprocessor(MessagesPreprocessor):
895908

896909
def prepare_dataset(self, dataset):
910+
if not use_hf_hub():
911+
url = ('https://www.modelscope.cn/api/v1/datasets/AI-ModelScope/LLaVA-Pretrain/repo?'
912+
'Revision=master&FilePath=images.zip')
913+
else:
914+
url = f'{get_hf_endpoint()}/datasets/liuhaotian/LLaVA-Pretrain/resolve/main/images.zip'
897915
self.media_dir = MediaResource.download(
898-
('https://www.modelscope.cn/api/v1/datasets/AI-ModelScope/LLaVA-Pretrain/repo?'
899-
'Revision=master&FilePath=images.zip'),
916+
url,
900917
# noqa
901918
'llava_pretrain')
902919
return super().prepare_dataset(dataset)

swift/utils/__init__.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
# Copyright (c) Alibaba, Inc. and its affiliates.
22

3-
from .env import (get_dist_setting, get_node_setting, get_pai_tensorboard_dir, is_deepspeed_enabled, is_dist,
4-
is_dist_ta, is_local_master, is_master, is_mp, is_mp_ddp, is_pai_training_job, torchacc_trim_graph,
5-
use_hf_hub, use_torchacc)
3+
from .env import (get_dist_setting, get_hf_endpoint, get_node_setting, get_pai_tensorboard_dir, is_deepspeed_enabled,
4+
is_dist, is_dist_ta, is_local_master, is_master, is_mp, is_mp_ddp, is_pai_training_job,
5+
torchacc_trim_graph, use_hf_hub, use_torchacc)
66
from .import_utils import (is_liger_available, is_lmdeploy_available, is_megatron_available, is_swanlab_available,
77
is_unsloth_available, is_vllm_ascend_available, is_vllm_available, is_wandb_available,
88
is_xtuner_available)

swift/utils/env.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,13 @@ def use_hf_hub():
1515
return strtobool(os.environ.get('USE_HF', '0'))
1616

1717

18+
def get_hf_endpoint():
19+
hf_endpoint = os.environ.get('HF_ENDPOINT', 'https://huggingface.co/')
20+
if hf_endpoint.endswith('/'):
21+
hf_endpoint = hf_endpoint[:-1]
22+
return hf_endpoint
23+
24+
1825
def is_deepspeed_enabled():
1926
return strtobool(os.environ.get('ACCELERATE_USE_DEEPSPEED', '0'))
2027

0 commit comments

Comments
 (0)