Enhance silence detection and removal functionality in audio clips

k9p5 · k9p5 · commit 437f2cbb111d · 2024-12-01T14:39:53.000-08:00
- Added `removeSilences` method to `MediaClip` class for removing silence segments based on specified options.
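- Introduced `SilenceDetectionOptions` type (replacing `SilenceOptions`) for improved configuration of silence detection parameters: `threshold` is now an RMS level instead of a dB value, `minDuration` is in seconds instead of milliseconds, and `hopSize` replaces `windowSize`.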
- Updated `AudioSource` class to utilize new silence detection logic, replacing the previous implementation.
- Refactored silence detection utility to process audio buffers more efficiently.
- Added `audio.fixtures.ts`, which defines the `MIN_SAMPLE_RATE` constant (previously `DEFAULT_SAMPLE_RATE` in `audio.ts`).
- Removed the outdated silence detection tests (covering `findSilences` and the previous `AudioSource.silences` behavior) from `audio.spec.ts` to streamline testing focus.

These changes improve the performance and flexibility of silence handling in audio processing.
diff --git a/src/clips/media/media.ts b/src/clips/media/media.ts
@@ -6,7 +6,7 @@
  */
 
 import { Timestamp, Transcript } from '../../models';
-import { AudioSource } from '../../sources';
+import { AudioSource, SilenceDetectionOptions } from '../../sources';
 import { RangeDeserializer } from './media.deserializer';
 import { serializable } from '../../services';
 import { replaceKeyframes } from '../clip/clip.utils';
@@ -313,4 +313,58 @@ export class MediaClip<Props extends MediaClipProps = MediaClipProps> extends Cl
 	) {
 		return this.addCaptions(strategy);
 	}
+
+	/**
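+	 * Remove silences that start or end inside the clip's range. This clip's range is trimmed in place and a copy is added for every further audible segment.
+	 * @param options - Options for silence detection
+	 * @returns An array starting with this clip, followed by the copies created at each split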
+	 */
+	public async removeSilences(options: SilenceDetectionOptions = {}): Promise<MediaClip<Props>[]> {
+		const silences = (await this.source.silences(options))
+			.filter((silence) => inRange(silence, this.range))
+			.sort((a, b) => a.start.millis - b.start.millis);
+
+		if (silences.length == 0) {
+			return [this];
+		}
+
+		const result: MediaClip<Props>[] = [this];
+
+		for (const silence of silences) {
+			const item = result.at(-1);
+
+			if (!item) break;
+			if (!inRange(silence, item.range)) continue;
+
+			if (silence.start.millis > item.range[0].millis && silence.stop.millis < item.range[1].millis) {
+				const copy = item.copy();
+
+				item.range[1] = silence.start;
+				copy.range[0] = silence.stop;
+
+				result.push(copy);
+			} else if (silence.start.millis <= item.range[0].millis) {
+				item.range[0] = silence.stop;
+			} else if (silence.stop.millis >= item.range[1].millis) {
+				item.range[1] = silence.start;
+			}
+		}
+
+		return result;
+	}
+}
+
+function inRange(
+	silence: {
+		start: Timestamp;
+		stop: Timestamp;
+	},
+	range: [Timestamp, Timestamp],
+): boolean {
+	return (
+		(silence.start.millis >= range[0].millis &&
+			silence.start.millis <= range[1].millis) ||
+		(silence.stop.millis <= range[1].millis &&
+			silence.stop.millis >= range[0].millis)
+	)
 }
diff --git a/src/sources/audio.fixtures.ts b/src/sources/audio.fixtures.ts
@@ -0,0 +1 @@
+export const MIN_SAMPLE_RATE = 3000;
diff --git a/src/sources/audio.spec.ts b/src/sources/audio.spec.ts
@@ -6,9 +6,7 @@
  */
 
 import { describe, it, vi, beforeEach, expect } from 'vitest';
-import { AudioSource } from './audio'; // Import the AudioSource class
-import { findSilences } from './audio.utils';
-import { Timestamp } from '../models';
+import { AudioSource } from './audio';
 
 // Mocking the OfflineAudioContext class
 class MockOfflineAudioContext {
@@ -27,30 +25,6 @@ class MockOfflineAudioContext {
 
 vi.stubGlobal('OfflineAudioContext', MockOfflineAudioContext); // Stub the global OfflineAudioContext
 
-describe('AudioUtils', () => {
-	it('all silent', () => {
-		const silences = findSilences(new Float32Array(100).fill(0), -50, 100, 100);
-		expect(silences).toEqual([{
-			start: new Timestamp(0),
-			stop: new Timestamp(100),
-		}]);
-	});
-
-	it('no silences', () => {
-		const silences = findSilences(new Float32Array(100).fill(1), -50, 100, 100);
-		expect(silences).toEqual([]);
-	});
-
-	it('find silences correctly', () => {
-		const samples = Array.from({ length: 500 }, (_, index) => index > 300 ? (index < 400 ? 0 : 1) : -1);
-		const silences = findSilences(new Float32Array(samples), -50, 100, 5000);
-		expect(silences).toEqual([{
-			start: new Timestamp(3010),
-			stop: new Timestamp(4000),
-		}]);
-	});
-});
-
 describe('AudioSource', () => {
 	let audioSource: AudioSource;
 
@@ -59,42 +33,6 @@ describe('AudioSource', () => {
 		audioSource.file = new File([], 'audio.mp3', { type: 'audio/mp3' });
 	});
 
-	it('find silences correctly', async () => {
-		const audioBuffer = {
-			duration: 16,
-			sampleRate: 1000,
-			length: 16000,
-			getChannelData: () => new Float32Array(16000).fill(0), // Return a dummy Float32Array
-		} as any as AudioBuffer;
-		audioSource.audioBuffer = audioBuffer;
-		const silences = await audioSource.silences({});
-		expect(silences).toEqual([{
-			start: new Timestamp(0),
-			stop: new Timestamp(16000),
-		}]);
-	});
-
-	it('find silences correctly with too high minDuration', async () => {
-		const audioBuffer = {
-			duration: 16,
-			sampleRate: 1000,
-			length: 16000,
-			getChannelData: () => new Float32Array(16000).fill(0), // Return a dummy Float32Array
-		} as any as AudioBuffer;
-		audioSource.audioBuffer = audioBuffer;
-		const silences = await audioSource.silences({minDuration: 1e10});
-		expect(silences).toEqual([{
-			start: new Timestamp(0),
-			stop: new Timestamp(16000),
-		}]);
-	});
-
-	it('find silences correctly after caching', async () => {
-		const silences = await audioSource.silences({});
-		const cachedSilences = await audioSource.silences({threshold: 0, minDuration: 1e10, windowSize: 1e10});
-		expect(silences).toEqual(cachedSilences);
-	});
-
 	it('should decode an audio buffer correctly', async () => {
 		const buffer = await audioSource.decode(2, 44100, true);
 		expect(buffer.duration).toBe(5); // Mock duration
diff --git a/src/sources/audio.ts b/src/sources/audio.ts
@@ -6,14 +6,13 @@
  */
 
 import { Source } from './source';
+import { detectSilences } from './audio.utils';
+import { MIN_SAMPLE_RATE } from './audio.fixtures';
 
 import type { ClipType } from '../clips';
 import type { ArgumentTypes } from '../types';
-import type { FastSamplerOptions, SilenceOptions } from './audio.types';
+import type { AudioSlice, FastSamplerOptions, SilenceDetectionOptions } from './audio.types';
 import type { Timestamp, Transcript } from '../models';
-import { findSilences } from './audio.utils';
-
-const DEFAULT_SAMPLE_RATE = 3000;
 
 export class AudioSource<T extends Object = {}> extends Source<T> {
 	public readonly type: ClipType = 'audio';
@@ -89,12 +88,12 @@ export class AudioSource<T extends Object = {}> extends Source<T> {
 		if (typeof start === 'object') start = start.millis;
 		if (typeof stop === 'object') stop = stop.millis;
 
-		const audioBuffer = this.audioBuffer ?? (await this.decode(1, DEFAULT_SAMPLE_RATE, true));
+		const audioBuffer = this.audioBuffer ?? (await this.decode(1, MIN_SAMPLE_RATE, true));
 		const channelData = audioBuffer.getChannelData(0);
 
-		const firstSample = Math.floor(Math.max((start * DEFAULT_SAMPLE_RATE) / 1000, 0));
+		const firstSample = Math.floor(Math.max((start * MIN_SAMPLE_RATE) / 1000, 0));
 		const lastSample = stop
-			? Math.floor(Math.min((stop * DEFAULT_SAMPLE_RATE) / 1000, audioBuffer.length))
+			? Math.floor(Math.min((stop * MIN_SAMPLE_RATE) / 1000, audioBuffer.length))
 			: audioBuffer.length;
 
 		const windowSize = Math.floor((lastSample - firstSample) / length);
@@ -137,20 +136,18 @@ export class AudioSource<T extends Object = {}> extends Source<T> {
 	 * @param options - Silences options.
 	 * @returns An array of the silences (in ms) in the clip.
 	 */
-	public async silences({
-		threshold = -50,
-		minDuration = 100,
-		windowSize = 50,
-	}: SilenceOptions = {}): Promise<{ start: Timestamp; stop: Timestamp }[]> {
+	public async silences(options: SilenceDetectionOptions = {}): Promise<AudioSlice[]> {
 		if (this._silences) return this._silences;
 
-		const audioBuffer = this.audioBuffer ?? (await this.decode(1, DEFAULT_SAMPLE_RATE, true));
-		const length = Math.floor(audioBuffer.length / windowSize);
-		const samples = await this.fastsampler({ length, logarithmic: false });
+		const buffer = await this.arrayBuffer();
+
+		const ctx = new AudioContext();
+
+		const audioBuffer = await ctx.decodeAudioData(buffer);
+		this._silences = detectSilences(audioBuffer, options);
 
-		const silences = findSilences(samples, threshold, minDuration, this.duration.millis);
-		this._silences = silences;
+		ctx.close();
 
-		return silences;
+		return this._silences;
 	}
 }
diff --git a/src/sources/audio.types.ts b/src/sources/audio.types.ts
@@ -22,17 +22,25 @@ export type FastSamplerOptions = {
 	logarithmic?: boolean;
 };
 
-export type SilenceOptions = {
+export type SilenceDetectionOptions = {
 	/**
-	 * The threshold to use for the silence detection in db.
-	 */	
+	 * If the RMS amplitude of a frame is below this threshold, the frame is considered silent.
+	 * @default 0.02
+	 */
 	threshold?: number;
 	/**
-	 * The minimum duration of a silence to be considered a silence in milliseconds.
+	 * The number of samples per analysis frame. Smaller values capture short silences more accurately.
+	 * @default 1024
 	 */
-	minDuration?: number;
+	hopSize?: number;
 	/**
-	 * The window size to use for the silence detection.
+	 * The minimum duration of a silence in seconds. This helps avoid detecting brief gaps between sounds as silences.
+	 * @default 0.5
 	 */
-	windowSize?: number;
+	minDuration?: number;
+};
+
+export type AudioSlice = {
+	start: Timestamp;
+	stop: Timestamp;
 };
diff --git a/src/sources/audio.utils.ts b/src/sources/audio.utils.ts
@@ -1,56 +1,69 @@
 import { Timestamp } from '../models';
 
+import type { AudioSlice, SilenceDetectionOptions } from './audio.types';
+
+
 /**
- * Find the silences in an audio clip.
- * @param samples - The sub-sampled samples of the audio clip.
- * @param threshold - The threshold to use for the silence detection in db.
- * @param minDuration - The minimum duration of a silence to be considered a silence in milliseconds.
- * @param duration - The length of the audio clip in milliseconds.
+ * Detect silences in an audio buffer
+ * @param audioBuffer - The web audio buffer.
+ * @param options - Silence detection options:
+ *   `threshold` (RMS level below which a frame is silent, default 0.02), `hopSize` (frame length
+ *   in samples, default 1024) and `minDuration` (minimum silence length in seconds, default 0.5).
  * @returns An array of the silences in the clip.
  */
-export function findSilences(
-	samples: Float32Array,
-	threshold: number,
-	minDuration: number,
-	duration: number,
-): { start: Timestamp; stop: Timestamp }[] {
-	const decibelValues = samples.map((sample) => 20 * Math.log10(Math.max(Math.abs(sample), 1e-10)));
-	const silences: { start: Timestamp; stop: Timestamp }[] = [];
-
-	// Find silence periods in this clip
+export function detectSilences(
+	audioBuffer: AudioBuffer,
+	options: SilenceDetectionOptions = {}
+): AudioSlice[] {
+	const { threshold = 0.02, hopSize = 1024, minDuration = 0.5 } = options;
+
+	const slices: AudioSlice[] = [];
+	const channel = audioBuffer.getChannelData(0);
+	const sampleRate = audioBuffer.sampleRate;
+
+	// Convert minDuration from seconds to samples
+	const minSamples = Math.floor(minDuration * sampleRate);
+
 	let silenceStart: number | null = null;
+	let consecutiveSilentSamples = 0;
 
-	for (let i = 0; i < decibelValues.length; i++) {
-		if (decibelValues[i] < threshold) {
+	// Process audio in frames
+	for (let i = 0; i < channel.length; i += hopSize) {
+		// Calculate RMS for current frame
+		let rms = 0;
+		const frameEnd = Math.min(i + hopSize, channel.length);
+
+		for (let j = i; j < frameEnd; j++) {
+			rms += channel[j] * channel[j];
+		}
+		rms = Math.sqrt(rms / (frameEnd - i));
+
+		// Check if frame is silent
+		if (rms < threshold) {
+			consecutiveSilentSamples += hopSize;
 			if (silenceStart === null) {
 				silenceStart = i;
 			}
-		} else if (silenceStart !== null) {
-			const silenceDuration = ((i - silenceStart) * duration) / decibelValues.length;
-			if (silenceDuration >= minDuration) {
-				// Convert chunk indices to seconds and adjust for clip offset
-				const silenceStartFrame = Math.round((silenceStart * duration) / decibelValues.length);
-				const silenceStopFrame = Math.round((i * duration) / decibelValues.length);
-
-				silences.push({
-					start: new Timestamp(silenceStartFrame),
-					stop: new Timestamp(silenceStopFrame),
+		} else {
+			// If we had a silence of sufficient duration, add it to slices
+			if (silenceStart !== null && consecutiveSilentSamples >= minSamples) {
+				slices.push({
+					start: Timestamp.fromSeconds(silenceStart / sampleRate),
+					stop: Timestamp.fromSeconds(i / sampleRate)
 				});
 			}
 			silenceStart = null;
+			consecutiveSilentSamples = 0;
 		}
 	}
 
-	// Handle silence at end of clip
-	if (silenceStart !== null) {
-		const silenceDuration = decibelValues.length - silenceStart;
-		if (silenceDuration >= minDuration || silenceDuration == decibelValues.length) {
-			silences.push({
-				start: new Timestamp(Math.round((silenceStart * duration) / decibelValues.length)),
-				stop: new Timestamp(duration),
-			});
-		}
+	// Handle silence at the end of audio
+	if (silenceStart !== null && consecutiveSilentSamples >= minSamples) {
+		slices.push({
+			start: Timestamp.fromSeconds(silenceStart / sampleRate),
+			stop: Timestamp.fromSeconds(channel.length / sampleRate)
+		});
 	}
 
-	return silences;
+	return slices;
 }
diff --git a/src/tracks/media/media.ts b/src/tracks/media/media.ts