-
Notifications
You must be signed in to change notification settings - Fork 113
use xav
instead of ffmpeg
#403
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 1 commit
1b6e128
7b1c02f
fd81786
fc45ff5
d882343
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||||
---|---|---|---|---|---|---|---|---|
|
@@ -136,7 +136,7 @@ defmodule Bumblebee.Audio.SpeechToTextWhisper do | |||||||
{:ok, [Nx.backend_transfer(input, Nx.BinaryBackend)]} | ||||||||
|
||||||||
{:file, path} when is_binary(path) -> | ||||||||
ffmpeg_read_as_pcm(path, sampling_rate) | ||||||||
from_file(path, sampling_rate) | ||||||||
|
||||||||
other -> | ||||||||
cond do | ||||||||
|
@@ -164,49 +164,27 @@ defmodule Bumblebee.Audio.SpeechToTextWhisper do | |||||||
end | ||||||||
end | ||||||||
|
||||||||
defp ffmpeg_read_as_pcm(path, sampling_rate) do | ||||||||
channels = 1 | ||||||||
defp from_file(path, sampling_rate) do | ||||||||
# This chunk can be of arbitrary size, the serving accumulates | ||||||||
# and overlaps chunks internally as needed. | ||||||||
|
||||||||
format = | ||||||||
case System.endianness() do | ||||||||
:little -> "f32le" | ||||||||
:big -> "f32be" | ||||||||
end | ||||||||
|
||||||||
cond do | ||||||||
System.find_executable("ffmpeg") == nil -> | ||||||||
{:error, "ffmpeg not found in PATH"} | ||||||||
|
||||||||
not File.exists?(path) -> | ||||||||
{:error, "no file found at #{path}"} | ||||||||
|
||||||||
true -> | ||||||||
# This chunk can be of arbitrary size, the serving accumulates | ||||||||
# and overlaps chunks internally as needed. We read the file | ||||||||
# as stream to reduce memory usage | ||||||||
chunk_size = 30 | ||||||||
|
||||||||
stream = | ||||||||
Stream.iterate(0, fn offset -> offset + chunk_size end) | ||||||||
|> Stream.transform({}, fn offset, acc -> | ||||||||
System.cmd( | ||||||||
"ffmpeg", | ||||||||
~w[-ss #{offset} -t #{chunk_size} -i #{path} -ac #{channels} -ar #{sampling_rate} -f #{format} -hide_banner -loglevel quiet pipe:1] | ||||||||
) | ||||||||
|> case do | ||||||||
{<<>>, 0} -> | ||||||||
{:halt, acc} | ||||||||
|
||||||||
{data, 0} -> | ||||||||
chunk = Nx.from_binary(data, :f32, backend: Nx.BinaryBackend) | ||||||||
{[chunk], acc} | ||||||||
|
||||||||
{_, 1} -> | ||||||||
raise "ffmpeg failed to decode the given file" | ||||||||
end | ||||||||
end) | ||||||||
|
||||||||
{:ok, stream} | ||||||||
if File.exists?(path) do | ||||||||
stream = | ||||||||
path | ||||||||
|> Xav.Reader.stream!( | ||||||||
read: :audio, | ||||||||
out_format: :f32, | ||||||||
out_channels: 1, | ||||||||
out_sample_rate: sampling_rate | ||||||||
) | ||||||||
|> Stream.map(fn frame -> Xav.Frame.to_nx(frame) end) | ||||||||
|> Stream.chunk_every(1000) | ||||||||
|> Stream.map(&Nx.Batch.concatenate/1) | ||||||||
|> Stream.map(fn batch -> Nx.Defn.jit_apply(&Function.identity/1, [batch]) end) | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This function just needs to return a stream of chunks, so we don't need to do this concatenation.
Suggested change
Do you know what determines the length of each chunk? Could that be configurable, perhaps? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I am not completely sure, but I'm guessing that a chunk/frame in a video context is the audio duration of one frame. When I remove the … {:ok, whisper} = Bumblebee.load_model({:hf, "openai/whisper-tiny"})
{:ok, featurizer} = Bumblebee.load_featurizer({:hf, "openai/whisper-tiny"})
{:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "openai/whisper-tiny"})
{:ok, generation_config} = Bumblebee.load_generation_config({:hf, "openai/whisper-tiny"})
serving =
Bumblebee.Audio.speech_to_text_whisper(whisper, featurizer, tokenizer, generation_config,
defn_options: [compiler: EXLA]
)
# Read a couple of frames.
# See https://hexdocs.pm/bumblebee/Bumblebee.Audio.WhisperFeaturizer.html for default sampling rate.
frames =
Xav.Reader.stream!("sample.mp3", read: :audio, out_format: :f32, out_channels: 1, out_sample_rate: 16_000)
|> Stream.take(200)
|> Enum.map(fn frame -> Xav.Reader.to_nx(frame) end)
batch = Nx.Batch.concatenate(frames)
batch = Nx.Defn.jit_apply(&Function.identity/1, [batch])
Nx.Serving.run(serving, batch) There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I see. The serving transforms the stream to accumulate smaller chunks, but there is a place where we need to append to a list and that may be the reason why it's inefficient with tiny chunks. However, either way, I think it's wasteful to convert every frame to a tensor just to concatenate later. With the current ffmpeg code we get a single binary for the whole 30s and create a tensor from that. So ideally we want to replicate this. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Would this suffice? path
|> Xav.Reader.stream!(
read: :audio,
out_format: :f32,
out_channels: 1,
out_sample_rate: sampling_rate
)
|> Stream.chunk_every(1000)
|> Stream.map(fn frames ->
[frame | _] = frames
binary = Enum.reduce(frames, <<>>, fn frame, acc -> acc <> frame.data end)
Nx.with_default_backend(Nx.BinaryBackend, fn -> Nx.from_binary(binary, frame.format) end)
end) The chunk size of 1000 is currently arbitrary because we don't know the frame_size of the used codec. This could be added to … There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Correction: The information is already there. round(sampling_rate / frame.samples * 30) There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Handling the frame binaries directly is a good call. I think we can transform the stream, so that we can accumulate the binary, instead of waiting for 1000 binaries and joining them. I was thinking we can determine the number of samples from binary size, but … So it would be something like this: chunk_samples = sampling_rate * 30
path
|> Xav.Reader.stream!(
read: :audio,
out_format: :f32,
out_channels: 1,
out_sample_rate: sampling_rate
)
|> Stream.transform(
fn -> {<<>>, 0} end,
fn frame, {buffer, samples} ->
buffer = buffer <> frame.data
samples = samples + frame.samples
if samples >= chunk_samples do
chunk = Nx.from_binary(buffer, :f32, backend: Nx.BinaryBackend)
{[chunk], {<<>>, 0}}
else
{[], {buffer, samples}}
end
end,
fn {buffer, _samples} ->
chunk = Nx.from_binary(buffer, :f32, backend: Nx.BinaryBackend)
{[chunk], {<<>>, 0}}
end,
fn _ -> :ok end
) There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Wow! That is a great solution! Thanks! |
||||||||
|
||||||||
{:ok, stream} | ||||||||
else | ||||||||
{:error, "no file found at #{path}"} | ||||||||
end | ||||||||
end | ||||||||
|
||||||||
|
Uh oh!
There was an error while loading. Please reload this page.