Skip to content
Open
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 26 additions & 35 deletions src/whisper.cpp
Original file line number Diff line number Diff line change
@@ -6652,8 +6652,8 @@ static bool whisper_vad(

if (vad_segments->data.size() > 0) {
state->has_vad_segments = true;
ctx->state->vad_segments.clear();
ctx->state->vad_segments.reserve(vad_segments->data.size());
state->vad_segments.clear();
state->vad_segments.reserve(vad_segments->data.size());

// Initialize the time mapping table
state->vad_mapping_table.clear();
@@ -6749,7 +6749,7 @@ static bool whisper_vad(

WHISPER_LOG_INFO("%s: vad_segment_info: orig_start: %.2f, orig_end: %.2f, vad_start: %.2f, vad_end: %.2f\n",
__func__, segment.orig_start/100.0, segment.orig_end/100.0, segment.vad_start/100.0, segment.vad_end/100.0);
ctx->state->vad_segments.push_back(segment);
state->vad_segments.push_back(segment);

// Copy this speech segment
memcpy(filtered_samples.data() + offset, samples + segment_start_samples, segment_length * sizeof(float));
@@ -6820,6 +6820,24 @@ int whisper_full_with_state(
}
}

std::vector<float> vad_samples;
if (params.vad)
{
WHISPER_LOG_INFO("%s: VAD is enabled, processing speech segments only\n", __func__);
if (!whisper_vad(ctx, state, params, samples, n_samples, vad_samples))
{
WHISPER_LOG_ERROR("%s: failed to compute VAD\n", __func__);
return -1;
}
if (vad_samples.empty())
{
state->result_all.clear();
return 0;
}
samples = vad_samples.data();
n_samples = vad_samples.size();
}

// auto-detect language if not specified
if (params.language == nullptr || strlen(params.language) == 0 || strcmp(params.language, "auto") == 0 || params.detect_language) {
std::vector<float> probs(whisper_lang_max_id() + 1, 0.0f);
@@ -7720,25 +7738,11 @@ int whisper_full_with_state(
}

// whisper_full: runs on the context's own state by forwarding to
// whisper_full_with_state(ctx, ctx->state, ...) and returning its result.
//
// NOTE(review): this span comes from a diff view whose +/- markers were lost,
// so two versions of the function appear interleaved and the text is not
// compilable as-is. Confirm against the actual src/whisper.cpp before use.
int whisper_full(
// NOTE(review): presumably the pre-change signature and body, running up to the
// closing brace of the `if (params.vad)` block below — confirm.
struct whisper_context * ctx,
struct whisper_full_params params,
const float * samples,
int n_samples) {

// Buffer that whisper_vad() fills; it is a local, so the pointer taken from it
// further down is only valid for the duration of this call.
std::vector<float> vad_samples;
if (params.vad) {
WHISPER_LOG_INFO("%s: VAD is enabled, processing speech segments only\n", __func__);
// A false return from whisper_vad() is reported and turned into -1.
if (!whisper_vad(ctx, ctx->state, params, samples, n_samples, vad_samples)) {
WHISPER_LOG_ERROR("%s: failed to compute VAD\n", __func__);
return -1;
}
// Empty output (presumably no speech was found — confirm): clear any results
// held in ctx->state and return 0 without running the rest of the function.
if (vad_samples.empty()) {
ctx->state->result_all.clear();
return 0;
}
// From here on, process the VAD output instead of the caller's buffer.
samples = vad_samples.data();
n_samples = vad_samples.size();
}
// NOTE(review): presumably the post-change signature and body — no VAD handling
// here, only delegation. An equivalent VAD block, operating on `state` rather
// than `ctx->state`, appears in the whisper_full_with_state hunk of this diff.
struct whisper_context *ctx,
struct whisper_full_params params,
const float *samples,
int n_samples)
{
return whisper_full_with_state(ctx, ctx->state, params, samples, n_samples);
}

@@ -7753,19 +7757,6 @@ int whisper_full_parallel(
return whisper_full(ctx, params, samples, n_samples);
}

std::vector<float> vad_samples;
if (params.vad) {
WHISPER_LOG_INFO("%s: VAD is enabled, processing speech segments only\n", __func__);
if (!whisper_vad(ctx, ctx->state, params, samples, n_samples, vad_samples)) {
WHISPER_LOG_ERROR("%s: failed to compute VAD\n", __func__);
return -1;
}
if (vad_samples.empty()) {
return 0;
}
samples = vad_samples.data();
n_samples = vad_samples.size();
}
int ret = 0;

// prepare separate states for each thread
Expand Down
Loading