Skip to content

Commit 36d450e

Browse files
nemoramo and mayufeng
authored
Adds FunAudioChat multimodal audio model support (#2) (#33058)
Signed-off-by: ramos <49182011+nemoramo@users.noreply.github.com> Signed-off-by: mayufeng <mayufeng@example.com> Co-authored-by: mayufeng <mayufeng@example.com>
1 parent a2b877d commit 36d450e

File tree

8 files changed

+1264
-0
lines changed

8 files changed

+1264
-0
lines changed

examples/offline_inference/audio_language.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,31 @@ def run_glmasr(question: str, audio_count: int) -> ModelRequestData:
117117
)
118118

119119

# FunAudioChat
def run_funaudiochat(question: str, audio_count: int) -> ModelRequestData:
    """Build the engine arguments and prompt for the FunAudioChat model.

    NOTE: FunAudioChat is not available on the HuggingFace Hub at the time of
    writing. Pass a local model path via `--model`.
    """
    args = EngineArgs(
        # Placeholder name; the caller replaces it with the `--model` path.
        model="funaudiochat",
        max_model_len=4096,
        max_num_seqs=2,
        limit_mm_per_prompt={"audio": audio_count},
        enforce_eager=True,
    )

    # One audio placeholder tag per input clip, each on its own line.
    audio_tags = "<|audio_bos|><|AUDIO|><|audio_eos|>\n" * audio_count

    return ModelRequestData(
        engine_args=args,
        prompt=f"{audio_tags}{question}",
    )
120145
# Granite Speech
121146
def run_granite_speech(question: str, audio_count: int) -> ModelRequestData:
122147
# NOTE - the setting in this example are somewhat different from what is
@@ -410,6 +435,7 @@ def run_whisper(question: str, audio_count: int) -> ModelRequestData:
410435
"audioflamingo3": run_audioflamingo3,
411436
"gemma3n": run_gemma3n,
412437
"glmasr": run_glmasr,
438+
"funaudiochat": run_funaudiochat,
413439
"granite_speech": run_granite_speech,
414440
"midashenglm": run_midashenglm,
415441
"minicpmo": run_minicpmo,
@@ -435,6 +461,12 @@ def parse_args():
435461
choices=model_example_map.keys(),
436462
help='Huggingface "model_type".',
437463
)
464+
parser.add_argument(
465+
"--model",
466+
type=str,
467+
default=None,
468+
help="Model ID or local path override. Required for funaudiochat.",
469+
)
438470
parser.add_argument(
439471
"--num-prompts", type=int, default=1, help="Number of prompts to run."
440472
)
@@ -467,6 +499,9 @@ def main(args):
467499
if model not in model_example_map:
468500
raise ValueError(f"Model type {model} is not supported.")
469501

502+
if model == "funaudiochat" and not args.model:
503+
raise ValueError("--model is required when --model-type=funaudiochat")
504+
470505
if args.tensor_parallel_size is not None and args.tensor_parallel_size < 1:
471506
raise ValueError(
472507
f"tensor_parallel_size must be a positive integer, "
@@ -477,6 +512,8 @@ def main(args):
477512
req_data = model_example_map[model](
478513
question_per_audio_count[audio_count], audio_count
479514
)
515+
if model == "funaudiochat":
516+
req_data.engine_args.model = args.model
480517

481518
# Disable other modalities to save memory
482519
default_limits = {"image": 0, "video": 0, "audio": 0}

tests/models/registry.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -692,6 +692,9 @@ def check_available_online(
692692
"baidu/ERNIE-4.5-VL-28B-A3B-PT",
693693
trust_remote_code=True,
694694
),
695+
"FunAudioChatForConditionalGeneration": _HfExamplesInfo(
696+
"funaudiochat", is_available_online=False
697+
),
695698
"FuyuForCausalLM": _HfExamplesInfo("adept/fuyu-8b"),
696699
"Gemma3ForConditionalGeneration": _HfExamplesInfo("google/gemma-3-4b-it"),
697700
"Gemma3nForConditionalGeneration": _HfExamplesInfo("google/gemma-3n-E2B-it"),

0 commit comments

Comments (0)