added find silences to audio source

Matze99 · Matze99 · commit c99cd44c2182 · 2024-11-06T15:39:40.000-08:00
diff --git a/playground/main.ts b/playground/main.ts
@@ -34,10 +34,15 @@ const image = await composition.add(
 
 const audioTrack = new core.AudioTrack();
 
-const audioTest = await new core.AudioClip(await core.AudioSource
-    .from('/silences.mp3'), {
-    volume: 0.1,
-  });
+const audioSource = await core.AudioSource.from('/silences.mp3');
+
+const silences = await audioSource.silences({});
+console.log(silences);
+
+const audioTest = await new core.AudioClip(audioSource, {
+  volume: 0.1,
+});
+console.log("duration", audioTest.duration.millis);
 
 await audioTrack.add(audioTest);
 
diff --git a/src/sources/audio.spec.ts b/src/sources/audio.spec.ts
@@ -7,6 +7,8 @@
 
 import { describe, it, vi, beforeEach, expect } from 'vitest';
 import { AudioSource } from './audio'; // Import the AudioSource class
+import { findSilences } from './audio.utils';
+import { Timestamp } from '../models';
 
 // Mocking the OfflineAudioContext class
 class MockOfflineAudioContext {
@@ -25,6 +27,33 @@ class MockOfflineAudioContext {
 
 vi.stubGlobal('OfflineAudioContext', MockOfflineAudioContext); // Stub the global OfflineAudioContext
 
+describe('AudioUtils', () => {
+	it('all silent', () => {
+		const silences = findSilences(new Float32Array(100).fill(1), -50, 100, 100); // every sample is 1 (0 dB), i.e. above the -50 threshold
+		expect(silences).toEqual([{
+			start: new Timestamp(0),
+			stop: new Timestamp(100),
+		}]);
+	});
+
+	it('no silences', () => {
+		const silences = findSilences(new Float32Array(100).fill(0), -50, 100, 100); // every sample is 0, clamped to 1e-10 (-200 dB), i.e. below the threshold
+		expect(silences).toEqual([]);
+	});
+
+	it('find silences correctly', () => {
+		const samples = Array.from({ length: 500 }, (_, index) => index > 300 ? (index < 400 ? 0 : 1) : -1); // indices 0-300 are -1, 301-399 are 0, 400-499 are 1
+		const silences = findSilences(new Float32Array(samples), -50, 100, 5000); // 500 samples over 5000 ms, so 10 ms per sample
+		expect(silences).toEqual([{
+			start: new Timestamp(0),
+			stop: new Timestamp(3010), // first sample at or below the threshold is index 301, times 10 ms
+		}, {
+			start: new Timestamp(4000), // index 400, times 10 ms
+			stop: new Timestamp(5000), // a run that reaches the end stops at the clip duration
+		}]);
+	});
+});
+
 describe('AudioSource', () => {
 	let audioSource: AudioSource;
 
@@ -33,6 +62,14 @@ describe('AudioSource', () => {
 		audioSource.file = new File([], 'audio.mp3', { type: 'audio/mp3' });
 	});
 
+	it('find silences correctly', async () => {
+		const silences = await audioSource.silences({});
+		expect(silences).toEqual([{
+			start: new Timestamp(0),
+			stop: new Timestamp(5000),
+		}]);
+	});
+
 	it('should decode an audio buffer correctly', async () => {
 		const buffer = await audioSource.decode(2, 44100, true);
 		expect(buffer.duration).toBe(5); // Mock duration
diff --git a/src/sources/audio.ts b/src/sources/audio.ts
@@ -1,16 +1,19 @@
 /**
  * Copyright (c) 2024 The Diffusion Studio Authors
  *
- * This Source Code Form is subject to the terms of the Mozilla 
+ * This Source Code Form is subject to the terms of the Mozilla
  * Public License, v. 2.0 that can be found in the LICENSE file.
  */
 
 import { Source } from './source';
 
 import type { ClipType } from '../clips';
 import type { ArgumentTypes } from '../types';
-import type { FastSamplerOptions } from './audio.types';
-import type { Transcript } from '../models';
+import type { FastSamplerOptions, SilenceOptions } from './audio.types';
+import type { Timestamp, Transcript } from '../models';
+import { findSilences } from './audio.utils';
+
+const DEFAULT_SAMPLE_RATE = 3000;
 
 export class AudioSource<T extends Object = {}> extends Source<T> {
 	public readonly type: ClipType = 'audio';
@@ -76,17 +79,21 @@ export class AudioSource<T extends Object = {}> extends Source<T> {
 	 * @param options - Sampling options.
 	 * @returns An array of the max values of the samples in the window.
 	 */
-	public async fastsampler({ length = 60, start = 0, stop, logarithmic = false }: FastSamplerOptions): Promise<Float32Array> {
+	public async fastsampler({
+		length = 60,
+		start = 0,
+		stop,
+		logarithmic = false,
+	}: FastSamplerOptions): Promise<Float32Array> {
 		if (typeof start === 'object') start = start.millis;
 		if (typeof stop === 'object') stop = stop.millis;
 
-		const sampleRate = 3000;
-		const audioBuffer = this.audioBuffer ?? (await this.decode(1, sampleRate, true));
+		const audioBuffer = this.audioBuffer ?? (await this.decode(1, DEFAULT_SAMPLE_RATE, true));
 		const channelData = audioBuffer.getChannelData(0);
 
-		const firstSample = Math.floor(Math.max(start * sampleRate / 1000, 0));
+		const firstSample = Math.floor(Math.max((start * DEFAULT_SAMPLE_RATE) / 1000, 0));
 		const lastSample = stop
-			? Math.floor(Math.min(stop * sampleRate / 1000, audioBuffer.length))
+			? Math.floor(Math.min((stop * DEFAULT_SAMPLE_RATE) / 1000, audioBuffer.length))
 			: audioBuffer.length;
 
 		const windowSize = Math.floor((lastSample - firstSample) / length);
@@ -121,4 +128,25 @@ export class AudioSource<T extends Object = {}> extends Source<T> {
 		}
 		return div;
 	}
+
+	/**
+	 * Find silences in the audio clip.
+	 *
+	 * Reuses `this.audioBuffer` when it is set, otherwise decodes the source with the
+	 * default sample rate of 3000. The buffer is only consulted for its length: `fastsampler`
+	 * is asked for `floor(buffer.length / windowSize)` values, which `findSilences` then scans.
+	 * @param options - Silences options.
+	 * @returns An array of the silences (in ms) in the clip.
+	 */
+	public async silences({
+		threshold = -50,
+		minDuration = 5,
+		windowSize = 50,
+	}: SilenceOptions): Promise<{ start: Timestamp; stop: Timestamp }[]> {
+		const buffer = this.audioBuffer ?? (await this.decode(1, DEFAULT_SAMPLE_RATE, true));
+		const windowCount = Math.floor(buffer.length / windowSize);
+		const levels = await this.fastsampler({ length: windowCount, logarithmic: false });
+
+		return findSilences(levels, threshold, minDuration, this.duration.millis);
+	}
 }
diff --git a/src/sources/audio.types.ts b/src/sources/audio.types.ts
@@ -4,20 +4,35 @@ import type { Timestamp } from '../models';
  * Fast sampler options.
  */
 export type FastSamplerOptions = {
-  /**
-   * The number of samples to return.
-   */
-  length?: number;
-  /**
-   * The start time in **milliseconds** relative to the beginning of the clip.
-   */
-  start?: Timestamp | number;
-  /**
-   * The stop time in **milliseconds** relative to the beginning of the clip.
-   */
-  stop?: Timestamp | number;
-  /**
-   * Whether to use a logarithmic scale.
-   */
-  logarithmic?: boolean;
+	/**
+	 * The number of samples to return.
+	 */
+	length?: number;
+	/**
+	 * The start time in **milliseconds** relative to the beginning of the clip.
+	 */
+	start?: Timestamp | number;
+	/**
+	 * The stop time in **milliseconds** relative to the beginning of the clip.
+	 */
+	stop?: Timestamp | number;
+	/**
+	 * Whether to use a logarithmic scale.
+	 */
+	logarithmic?: boolean;
+};
+
+export type SilenceOptions = {
+	/**
+	 * The threshold to use for the silence detection in dB (`AudioSource.silences` defaults to -50).
+	 */
+	threshold?: number;
+	/**
+	 * The minimum duration of a silence to be considered a silence in milliseconds (defaults to 5).
+	 */
+	minDuration?: number;
+	/**
+	 * The window size to use for the silence detection, in samples of the decoded audio buffer (defaults to 50).
+	 */
+	windowSize?: number;
 };
diff --git a/src/sources/audio.utils.ts b/src/sources/audio.utils.ts
@@ -0,0 +1,56 @@
+import { Timestamp } from '../models';
+
+/**
+ * Find the silences in an audio clip; each sample spans `duration / samples.length` ms.
+ * NOTE(review): runs ABOVE `threshold` are what get reported, which looks inverted — confirm.
+ * @param samples - The sub-sampled samples of the audio clip.
+ * @param threshold - The threshold to use for the silence detection in dB.
+ * @param minDuration - The minimum duration of a silence to be considered a silence in milliseconds.
+ * @param duration - The length of the audio clip in milliseconds.
+ * @returns An array of the silences in the clip, as start/stop timestamps in clip order.
+ */
+export function findSilences(
+	samples: Float32Array,
+	threshold: number,
+	minDuration: number,
+	duration: number,
+): { start: Timestamp; stop: Timestamp }[] {
+	// Clamp the magnitude to 1e-10 so a zero sample maps to -200 dB instead of -Infinity.
+	const decibelValues = samples.map((sample) => 20 * Math.log10(Math.max(Math.abs(sample), 1e-10)));
+	const silences: { start: Timestamp; stop: Timestamp }[] = [];
+	const windowCount = decibelValues.length;
+	// Un-rounded time in milliseconds covered by `windows` consecutive samples.
+	const toMillis = (windows: number): number => (windows * duration) / windowCount;
+
+	// Index of the first sample of the run currently being tracked, if any.
+	let silenceStart: number | null = null;
+
+	for (let i = 0; i < windowCount; i++) {
+		if (decibelValues[i] > threshold) {
+			if (silenceStart === null) {
+				silenceStart = i;
+			}
+		} else if (silenceStart !== null) {
+			// The run ended just before sample i; keep it only if it lasted long enough.
+			if (toMillis(i - silenceStart) >= minDuration) {
+				silences.push({
+					start: new Timestamp(Math.round(toMillis(silenceStart))),
+					stop: new Timestamp(Math.round(toMillis(i))),
+				});
+			}
+			silenceStart = null;
+		}
+	}
+
+	// Run reaching the end of the clip: measured in ms like the loop; a whole-clip run is always kept.
+	if (silenceStart !== null) {
+		if (toMillis(windowCount - silenceStart) >= minDuration || silenceStart === 0) {
+			silences.push({
+				start: new Timestamp(Math.round(toMillis(silenceStart))),
+				stop: new Timestamp(duration),
+			});
+		}
+	}
+
+	return silences;
+}