Skip to content

Commit a12f0f4

Browse files
fix lazy load (#328)
(cherry picked from commit 326d9a8b3fab1b4da35b6c04bd7c27dbbe9cf14c)
1 parent 0ea9d24 commit a12f0f4

File tree

2 files changed

+9
-14
lines changed

2 files changed

+9
-14
lines changed

swift/llm/utils/dataset.py

Lines changed: 1 addition & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -1121,20 +1121,7 @@ def load_dataset_from_local(
         dataset = HfDataset.from_dict(df.to_dict(orient='list'))
         dataset_list.append(preprocess_func(dataset))

-    dataset = concatenate_datasets(dataset_list)
-
-    def load_image(row):
-        from PIL import Image
-        import requests
-        if not os.path.exists(row['image']):
-            row['image'] = requests.get(row['image'], stream=True).raw
-        row['image'] = Image.open(row['image'])
-        return row
-
-    if 'image' in dataset.features and isinstance(dataset[0]['image'], str):
-        dataset = HfDataset.from_list(
-            dataset_map(dataset, load_image, num_proc=4).data)
-    return dataset
+    return concatenate_datasets(dataset_list)


 def get_custom_dataset(_: str, train_subset_split_list: Union[str, List[str]],

swift/llm/utils/template.py

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+import os
 from copy import deepcopy
 from typing import Any, Dict, List, Literal, Optional, Tuple, Union


@@ -554,6 +555,13 @@ def build_conversation_input_ids(

     def encode(self, example: Dict[str,
                                    Any]) -> Dict[str, Optional[List[int]]]:
+        if 'image' in example and isinstance(example['image'], str):
+            from PIL import Image
+            import requests
+            if not os.path.exists(example['image']):
+                example['image'] = requests.get(
+                    example['image'], stream=True).raw
+            example['image'] = Image.open(example['image'])
         return self.build_conversation_input_ids(
             self.tokenizer,
             query=example['query'],

0 commit comments

Comments
 (0)