Skip to content

Commit c8e9ff9

Browse files
committed
Defer multimodal imports
Signed-off-by: Samuel Monson <[email protected]>
1 parent c1340b4 commit c8e9ff9

File tree

7 files changed

+552
-539
lines changed

7 files changed

+552
-539
lines changed

pylock.toml

Lines changed: 72 additions & 97 deletions
Large diffs are not rendered by default.

pyproject.toml

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,6 @@ dependencies = [
6363
"loguru",
6464
"msgpack",
6565
"numpy>=2.0.0",
66-
"pillow",
6766
"protobuf",
6867
"pydantic>=2.11.7",
6968
"pydantic-settings>=2.0.0",
@@ -82,7 +81,12 @@ recommended = ["guidellm[perf,openai]"]
8281
# Feature Extras
8382
perf = ["orjson", "msgpack", "msgspec", "uvloop"]
8483
openai = ["tiktoken>=0.11.0", "blobfile>=3.1.0"]
85-
multimodal = ["datasets[audio,vision]>=4.1.0", "torchcodec==0.7", "torch==2.8.*"]
84+
multimodal = [
85+
"datasets[audio,vision]>=4.1.0",
86+
"pillow",
87+
"torch==2.8.*",
88+
"torchcodec==0.7",
89+
]
8690
# Dev Tooling
8791
dev = [
8892
# Install all optional dependencies

src/guidellm/data/preprocessors/formatters.py

Lines changed: 32 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,14 @@
11
from __future__ import annotations
22

3+
from abc import ABCMeta
34
from typing import Any
45

56
from guidellm.data.preprocessors.preprocessor import (
67
DatasetPreprocessor,
78
PreprocessorRegistry,
89
)
910
from guidellm.data.schemas import GenerativeDatasetColumnType
10-
from guidellm.data.utils import encode_audio, encode_image, encode_video, text_stats
11+
from guidellm.data.utils import text_stats
1112
from guidellm.schemas import GenerationRequest, GenerationRequestArguments, UsageMetrics
1213

1314
__all__ = [
@@ -18,8 +19,28 @@
1819
]
1920

2021

22+
class RequestFormatter(DatasetPreprocessor, metaclass=ABCMeta):
    """
    Base class for request formatters that may need the multimodal encoders.

    The encoders live in ``guidellm.extras.multimodal`` and are imported only
    when one of the ``encode_*`` methods is actually called, so importing this
    module does not itself import the multimodal code.
    """

    @staticmethod
    def _multimodal_import_error(err: ImportError) -> ImportError:
        """
        Build an ImportError that points the user at the ``multimodal`` extra.

        :param err: the ImportError raised by the deferred import
        :return: a new ImportError carrying an install hint and the original
            error's ``name`` and ``path`` attributes
        """
        return ImportError(
            "Could not import guidellm.extras.multimodal, which is required to "
            "encode audio, image, or video data. The optional multimodal "
            "dependencies may be missing; install them with "
            "`pip install guidellm[multimodal]`. "
            f"Original error: {err}",
            name=err.name,
            path=err.path,
        )

    @staticmethod
    def encode_audio(*args, **kwargs):
        """
        Forward to ``guidellm.extras.multimodal.encode_audio``.

        :param args: positional arguments passed through unchanged
        :param kwargs: keyword arguments passed through unchanged
        :return: the value returned by the underlying encoder
        :raises ImportError: if ``guidellm.extras.multimodal`` cannot be imported
        """
        try:
            # Deferred so the multimodal module is only loaded when needed.
            from guidellm.extras.multimodal import encode_audio
        except ImportError as err:
            raise RequestFormatter._multimodal_import_error(err) from err

        return encode_audio(*args, **kwargs)

    @staticmethod
    def encode_image(*args, **kwargs):
        """
        Forward to ``guidellm.extras.multimodal.encode_image``.

        :param args: positional arguments passed through unchanged
        :param kwargs: keyword arguments passed through unchanged
        :return: the value returned by the underlying encoder
        :raises ImportError: if ``guidellm.extras.multimodal`` cannot be imported
        """
        try:
            # Deferred so the multimodal module is only loaded when needed.
            from guidellm.extras.multimodal import encode_image
        except ImportError as err:
            raise RequestFormatter._multimodal_import_error(err) from err

        return encode_image(*args, **kwargs)

    @staticmethod
    def encode_video(*args, **kwargs):
        """
        Forward to ``guidellm.extras.multimodal.encode_video``.

        :param args: positional arguments passed through unchanged
        :param kwargs: keyword arguments passed through unchanged
        :return: the value returned by the underlying encoder
        :raises ImportError: if ``guidellm.extras.multimodal`` cannot be imported
        """
        try:
            # Deferred so the multimodal module is only loaded when needed.
            from guidellm.extras.multimodal import encode_video
        except ImportError as err:
            raise RequestFormatter._multimodal_import_error(err) from err

        return encode_video(*args, **kwargs)
40+
41+
2142
@PreprocessorRegistry.register("text_completions")
22-
class GenerativeTextCompletionsRequestFormatter(DatasetPreprocessor):
43+
class GenerativeTextCompletionsRequestFormatter(RequestFormatter):
2344
def __init__(
2445
self,
2546
model: str,
@@ -92,7 +113,7 @@ def __call__(
92113

93114

94115
@PreprocessorRegistry.register("chat_completions")
95-
class GenerativeChatCompletionsRequestFormatter(DatasetPreprocessor):
116+
class GenerativeChatCompletionsRequestFormatter(RequestFormatter):
96117
def __init__(
97118
self,
98119
model: str,
@@ -120,7 +141,7 @@ def __init__(
120141
encode_kwargs.get("audio", {}) if encode_kwargs else {}
121142
)
122143

123-
def __call__(
144+
def __call__( # noqa: C901, PLR0912, PLR0915
124145
self, columns: dict[GenerativeDatasetColumnType, list[Any]]
125146
) -> GenerationRequest:
126147
arguments = GenerationRequestArguments(body={})
@@ -200,7 +221,7 @@ def __call__(
200221
if not image:
201222
continue
202223

203-
image_dict = encode_image(image, **self.encode_image_kwargs)
224+
image_dict = self.encode_image(image, **self.encode_image_kwargs)
204225
if (image_pixels := image_dict.get("image_pixels")) is not None:
205226
input_metrics.image_pixels = (
206227
input_metrics.image_pixels or 0
@@ -223,7 +244,7 @@ def __call__(
223244
if not video:
224245
continue
225246

226-
video_dict = encode_video(video, **self.encode_video_kwargs)
247+
video_dict = self.encode_video(video, **self.encode_video_kwargs)
227248
if (video_frames := video_dict.get("video_frames")) is not None:
228249
input_metrics.video_frames = (
229250
input_metrics.video_frames or 0
@@ -250,7 +271,9 @@ def __call__(
250271
if not audio:
251272
continue
252273

253-
audio_dict = encode_audio(audio, b64encode=True, **self.encode_audio_kwargs)
274+
audio_dict = self.encode_audio(
275+
audio, b64encode=True, **self.encode_audio_kwargs
276+
)
254277
if (audio_samples := audio_dict.get("audio_samples")) is not None:
255278
input_metrics.audio_samples = (
256279
input_metrics.audio_samples or 0
@@ -288,7 +311,7 @@ def __call__(
288311

289312

290313
@PreprocessorRegistry.register("audio_transcriptions")
291-
class GenerativeAudioTranscriptionRequestFormatter(DatasetPreprocessor):
314+
class GenerativeAudioTranscriptionRequestFormatter(RequestFormatter):
292315
def __init__(
293316
self,
294317
model: str,
@@ -345,7 +368,7 @@ def __call__( # noqa: C901
345368
f"one audio column, but got {len(audio_columns)}."
346369
)
347370

348-
audio_dict = encode_audio(
371+
audio_dict = self.encode_audio(
349372
audio_columns[0], b64encode=False, **self.encode_audio_kwargs
350373
)
351374
input_metrics.audio_samples = audio_dict.get("audio_samples")

src/guidellm/data/utils/__init__.py

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,10 @@
11
from .dataset import DEFAULT_SPLITS, resolve_dataset_split
22
from .functions import (
3-
encode_audio,
4-
encode_image,
5-
encode_video,
6-
get_file_format,
7-
is_url,
8-
resize_image,
93
text_stats,
104
)
115

126
__all__ = [
137
"DEFAULT_SPLITS",
14-
"encode_audio",
15-
"encode_image",
16-
"encode_video",
17-
"get_file_format",
18-
"is_url",
19-
"resize_image",
208
"resolve_dataset_split",
219
"text_stats",
2210
]

0 commit comments

Comments
 (0)