Skip to content

Commit 437f2cb

Browse files
committed
Enhance silence detection and removal functionality in audio clips
- Added a `removeSilences` method to the `MediaClip` class for removing silent segments based on the specified options.
- Introduced the `SilenceDetectionOptions` type for configuring silence detection parameters.
- Updated the `AudioSource` class to use the new silence detection logic, replacing the previous implementation.
- Refactored the silence detection utility to process audio buffers more efficiently.
- Added `audio.fixtures.ts`, which defines the minimum sample rate constant.
- Removed outdated silence detection tests from `audio.spec.ts` to streamline the testing focus.

These changes improve the performance and flexibility of silence handling in audio processing.
1 parent 4294a9f commit 437f2cb

File tree

7 files changed

+149
-190
lines changed

7 files changed

+149
-190
lines changed

src/clips/media/media.ts

Lines changed: 55 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
*/
77

88
import { Timestamp, Transcript } from '../../models';
9-
import { AudioSource } from '../../sources';
9+
import { AudioSource, SilenceDetectionOptions } from '../../sources';
1010
import { RangeDeserializer } from './media.deserializer';
1111
import { serializable } from '../../services';
1212
import { replaceKeyframes } from '../clip/clip.utils';
@@ -313,4 +313,58 @@ export class MediaClip<Props extends MediaClipProps = MediaClipProps> extends Cl
313313
) {
314314
return this.addCaptions(strategy);
315315
}
316+
317+
/**
 * Remove silences from the clip.
 *
 * The silences reported by the clip's source are intersected with the clip's
 * range. Every silence that lies strictly inside a clip splits it in two,
 * a silence overlapping the beginning or the end trims that edge, and a
 * silence spanning a whole clip removes that clip from the result.
 *
 * This clip is modified in place (its `range` is narrowed); the additional
 * pieces are created with `copy()`.
 *
 * NOTE(review): the silence timestamps are assigned to `range` by reference,
 * so they are shared with the array returned by `source.silences()` —
 * confirm that `Timestamp` instances are never mutated in place.
 *
 * @param options - Options for silence detection
 * @returns The remaining non-silent clips ordered by start time. The array is
 * `[this]` when no silence overlaps the clip and empty when the whole clip is
 * silent.
 */
public async removeSilences(options: SilenceDetectionOptions = {}): Promise<MediaClip<Props>[]> {
  // True when the silence spans the entire range, so nothing audible is left.
  const covers = (
    silence: { start: Timestamp; stop: Timestamp },
    range: [Timestamp, Timestamp],
  ): boolean =>
    silence.start.millis <= range[0].millis && silence.stop.millis >= range[1].millis;

  // `inRange` only matches silences with an endpoint inside the range, so a
  // silence that envelops the whole clip has to be matched explicitly.
  const silences = (await this.source.silences(options))
    .filter((silence) => inRange(silence, this.range) || covers(silence, this.range))
    .sort((a, b) => a.start.millis - b.start.millis);

  if (silences.length === 0) {
    return [this];
  }

  const result: MediaClip<Props>[] = [this];

  for (const silence of silences) {
    // Silences are sorted, so only the right-most piece can still be affected.
    const item = result.at(-1);

    // Every piece has been removed: nothing is left to trim.
    if (!item) break;

    if (covers(silence, item.range)) {
      // Trimming would yield a zero-length or inverted range; drop the piece.
      result.pop();
      continue;
    }

    if (!inRange(silence, item.range)) continue;

    if (silence.start.millis > item.range[0].millis && silence.stop.millis < item.range[1].millis) {
      // Silence strictly inside the piece: split it around the silence.
      const copy = item.copy();

      item.range[1] = silence.start;
      copy.range[0] = silence.stop;

      result.push(copy);
    } else if (silence.start.millis <= item.range[0].millis) {
      // Silence overlaps the beginning: move the start past it.
      item.range[0] = silence.stop;
    } else if (silence.stop.millis >= item.range[1].millis) {
      // Silence overlaps the end: cut the piece where the silence begins.
      item.range[1] = silence.start;
    }
  }

  return result;
}
355+
}
356+
357+
/**
 * Tell whether a silence has at least one endpoint inside a range.
 *
 * Both bounds of the range are inclusive. A silence that starts before the
 * range and stops after it has no endpoint inside the range and therefore
 * yields `false`.
 *
 * @param silence - The silence to test.
 * @param range - The `[start, stop]` range to test against.
 * @returns `true` when the start or the stop of the silence lies in the range.
 */
function inRange(
  silence: {
    start: Timestamp;
    stop: Timestamp;
  },
  range: [Timestamp, Timestamp],
): boolean {
  const [lower, upper] = range;

  const within = (point: Timestamp): boolean =>
    point.millis >= lower.millis && point.millis <= upper.millis;

  return within(silence.start) || within(silence.stop);
}

src/sources/audio.fixtures.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
/**
 * Sample rate, in samples per second, that `AudioSource` passes to `decode`
 * and uses to convert millisecond offsets into sample indices.
 */
export const MIN_SAMPLE_RATE = 3000;

src/sources/audio.spec.ts

Lines changed: 1 addition & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,7 @@
66
*/
77

88
import { describe, it, vi, beforeEach, expect } from 'vitest';
9-
import { AudioSource } from './audio'; // Import the AudioSource class
10-
import { findSilences } from './audio.utils';
11-
import { Timestamp } from '../models';
9+
import { AudioSource } from './audio';
1210

1311
// Mocking the OfflineAudioContext class
1412
class MockOfflineAudioContext {
@@ -27,30 +25,6 @@ class MockOfflineAudioContext {
2725

2826
vi.stubGlobal('OfflineAudioContext', MockOfflineAudioContext); // Stub the global OfflineAudioContext
2927

30-
describe('AudioUtils', () => {
31-
it('all silent', () => {
32-
const silences = findSilences(new Float32Array(100).fill(0), -50, 100, 100);
33-
expect(silences).toEqual([{
34-
start: new Timestamp(0),
35-
stop: new Timestamp(100),
36-
}]);
37-
});
38-
39-
it('no silences', () => {
40-
const silences = findSilences(new Float32Array(100).fill(1), -50, 100, 100);
41-
expect(silences).toEqual([]);
42-
});
43-
44-
it('find silences correctly', () => {
45-
const samples = Array.from({ length: 500 }, (_, index) => index > 300 ? (index < 400 ? 0 : 1) : -1);
46-
const silences = findSilences(new Float32Array(samples), -50, 100, 5000);
47-
expect(silences).toEqual([{
48-
start: new Timestamp(3010),
49-
stop: new Timestamp(4000),
50-
}]);
51-
});
52-
});
53-
5428
describe('AudioSource', () => {
5529
let audioSource: AudioSource;
5630

@@ -59,42 +33,6 @@ describe('AudioSource', () => {
5933
audioSource.file = new File([], 'audio.mp3', { type: 'audio/mp3' });
6034
});
6135

62-
it('find silences correctly', async () => {
63-
const audioBuffer = {
64-
duration: 16,
65-
sampleRate: 1000,
66-
length: 16000,
67-
getChannelData: () => new Float32Array(16000).fill(0), // Return a dummy Float32Array
68-
} as any as AudioBuffer;
69-
audioSource.audioBuffer = audioBuffer;
70-
const silences = await audioSource.silences({});
71-
expect(silences).toEqual([{
72-
start: new Timestamp(0),
73-
stop: new Timestamp(16000),
74-
}]);
75-
});
76-
77-
it('find silences correctly with too high minDuration', async () => {
78-
const audioBuffer = {
79-
duration: 16,
80-
sampleRate: 1000,
81-
length: 16000,
82-
getChannelData: () => new Float32Array(16000).fill(0), // Return a dummy Float32Array
83-
} as any as AudioBuffer;
84-
audioSource.audioBuffer = audioBuffer;
85-
const silences = await audioSource.silences({minDuration: 1e10});
86-
expect(silences).toEqual([{
87-
start: new Timestamp(0),
88-
stop: new Timestamp(16000),
89-
}]);
90-
});
91-
92-
it('find silences correctly after caching', async () => {
93-
const silences = await audioSource.silences({});
94-
const cachedSilences = await audioSource.silences({threshold: 0, minDuration: 1e10, windowSize: 1e10});
95-
expect(silences).toEqual(cachedSilences);
96-
});
97-
9836
it('should decode an audio buffer correctly', async () => {
9937
const buffer = await audioSource.decode(2, 44100, true);
10038
expect(buffer.duration).toBe(5); // Mock duration

src/sources/audio.ts

Lines changed: 15 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,13 @@
66
*/
77

88
import { Source } from './source';
9+
import { detectSilences } from './audio.utils';
10+
import { MIN_SAMPLE_RATE } from './audio.fixtures';
911

1012
import type { ClipType } from '../clips';
1113
import type { ArgumentTypes } from '../types';
12-
import type { FastSamplerOptions, SilenceOptions } from './audio.types';
14+
import type { AudioSlice, FastSamplerOptions, SilenceDetectionOptions } from './audio.types';
1315
import type { Timestamp, Transcript } from '../models';
14-
import { findSilences } from './audio.utils';
15-
16-
const DEFAULT_SAMPLE_RATE = 3000;
1716

1817
export class AudioSource<T extends Object = {}> extends Source<T> {
1918
public readonly type: ClipType = 'audio';
@@ -89,12 +88,12 @@ export class AudioSource<T extends Object = {}> extends Source<T> {
8988
if (typeof start === 'object') start = start.millis;
9089
if (typeof stop === 'object') stop = stop.millis;
9190

92-
const audioBuffer = this.audioBuffer ?? (await this.decode(1, DEFAULT_SAMPLE_RATE, true));
91+
const audioBuffer = this.audioBuffer ?? (await this.decode(1, MIN_SAMPLE_RATE, true));
9392
const channelData = audioBuffer.getChannelData(0);
9493

95-
const firstSample = Math.floor(Math.max((start * DEFAULT_SAMPLE_RATE) / 1000, 0));
94+
const firstSample = Math.floor(Math.max((start * MIN_SAMPLE_RATE) / 1000, 0));
9695
const lastSample = stop
97-
? Math.floor(Math.min((stop * DEFAULT_SAMPLE_RATE) / 1000, audioBuffer.length))
96+
? Math.floor(Math.min((stop * MIN_SAMPLE_RATE) / 1000, audioBuffer.length))
9897
: audioBuffer.length;
9998

10099
const windowSize = Math.floor((lastSample - firstSample) / length);
@@ -137,20 +136,18 @@ export class AudioSource<T extends Object = {}> extends Source<T> {
137136
* @param options - Silences options.
138137
* @returns An array of the silences (in ms) in the clip.
139138
*/
140-
public async silences(options: SilenceDetectionOptions = {}): Promise<AudioSlice[]> {
  // NOTE: the cached result is returned regardless of `options`; a later call
  // with different options gets the silences computed by the first call.
  if (this._silences) return this._silences;

  const buffer = await this.arrayBuffer();

  // A dedicated context is only needed to decode the file.
  const ctx = new AudioContext();

  try {
    const audioBuffer = await ctx.decodeAudioData(buffer);
    const silences = detectSilences(audioBuffer, options);

    this._silences = silences;

    return silences;
  } finally {
    // Release the context even when decoding fails, and wait for the release
    // so a rejection of `close()` is not left unhandled.
    await ctx.close();
  }
}
156153
}

src/sources/audio.types.ts

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -22,17 +22,25 @@ export type FastSamplerOptions = {
2222
logarithmic?: boolean;
2323
};
2424

25-
export type SilenceOptions = {
25+
export type SilenceDetectionOptions = {
2626
/**
27-
* The threshold to use for the silence detection in db.
28-
*/
27+
* If the RMS is below the threshold, the frame is considered silent.
28+
* @default 0.02
29+
*/
2930
threshold?: number;
3031
/**
31-
* The minimum duration of a silence to be considered a silence in milliseconds.
32+
* Number of samples in each analysed frame. This parameter affects how accurately the algorithm captures short silences.
33+
* @default 1024
3234
*/
33-
minDuration?: number;
35+
hopSize?: number;
3436
/**
35-
* The window size to use for the silence detection.
37+
* Minimum duration of a silence, in seconds. Setting a minimum duration for a silence period helps avoid detecting brief gaps between sounds as silences.
38+
* @default 0.5
3639
*/
37-
windowSize?: number;
40+
minDuration?: number;
41+
};
42+
43+
export type AudioSlice = {
44+
start: Timestamp;
45+
stop: Timestamp;
3846
};

src/sources/audio.utils.ts

Lines changed: 50 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -1,56 +1,69 @@
11
import { Timestamp } from '../models';
22

3+
import type { AudioSlice, SilenceDetectionOptions } from './audio.types';
4+
5+
36
/**
 * Detect silences in an audio buffer.
 *
 * The first channel of the buffer is cut into consecutive frames of
 * `hopSize` samples (the last frame may be shorter). A frame is silent when
 * its RMS is below `threshold`; a run of consecutive silent frames is
 * reported when it lasts at least `minDuration` seconds.
 *
 * Only channel 0 is analysed; other channels are ignored.
 *
 * @param audioBuffer - The web audio buffer.
 * @param options - Detection options:
 *  - `threshold`: RMS below which a frame is silent (default `0.02`).
 *  - `hopSize`: frame length in samples, rounded down to an integer
 *    (default `1024`).
 *  - `minDuration`: minimum duration of a silence in seconds (default `0.5`).
 * @returns An array of the silences in the clip, ordered by start time.
 * @throws RangeError if `hopSize` is not a finite number greater than or
 * equal to 1.
 */
export function detectSilences(
  audioBuffer: AudioBuffer,
  options: SilenceDetectionOptions = {},
): AudioSlice[] {
  const { threshold = 0.02, hopSize = 1024, minDuration = 0.5 } = options;

  // A hop below one sample would never advance the frame loop.
  if (!Number.isFinite(hopSize) || hopSize < 1) {
    throw new RangeError(`hopSize must be a finite number >= 1, got ${hopSize}`);
  }

  // Sample indices must be integers.
  const hop = Math.floor(hopSize);

  const slices: AudioSlice[] = [];
  const channel = audioBuffer.getChannelData(0);
  const sampleRate = audioBuffer.sampleRate;

  // Convert minDuration from seconds to samples
  const minSamples = Math.floor(minDuration * sampleRate);

  // Record the run [start, stop) (in samples) if it is long enough. The
  // length is measured from the indices so that a short final frame is not
  // counted as a full hop.
  const pushIfLongEnough = (start: number, stop: number): void => {
    if (stop - start >= minSamples) {
      slices.push({
        start: Timestamp.fromSeconds(start / sampleRate),
        stop: Timestamp.fromSeconds(stop / sampleRate),
      });
    }
  };

  let silenceStart: number | null = null;

  // Process audio in frames
  for (let i = 0; i < channel.length; i += hop) {
    const frameEnd = Math.min(i + hop, channel.length);

    // Calculate RMS for current frame
    let sumOfSquares = 0;
    for (let j = i; j < frameEnd; j++) {
      sumOfSquares += channel[j] * channel[j];
    }
    const rms = Math.sqrt(sumOfSquares / (frameEnd - i));

    if (rms < threshold) {
      // Silent frame: open a run unless one is already in progress.
      if (silenceStart === null) {
        silenceStart = i;
      }
    } else if (silenceStart !== null) {
      // First audible frame after a run: the run stops where this frame starts.
      pushIfLongEnough(silenceStart, i);
      silenceStart = null;
    }
  }

  // Handle silence at the end of audio
  if (silenceStart !== null) {
    pushIfLongEnough(silenceStart, channel.length);
  }

  return slices;
}

0 commit comments

Comments
 (0)