Skip to content

Commit 4b01cb3

Browse files
authored
Merge pull request #45 from diffusionstudio/matthias/feature/detect-pauses
Matthias/feature/detect pauses
2 parents 8b15c18 + 055d0aa commit 4b01cb3

File tree

11 files changed

+349
-48
lines changed

11 files changed

+349
-48
lines changed

playground/main.ts

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,16 @@ const image = await composition.add(
4242
})
4343
);
4444

45+
const audioTrack = composition.createTrack('audio').stacked(true);
46+
const audioSource = await core.AudioSource.from('/harvard.MP3');
47+
await audioTrack.add(
48+
await new core.AudioClip(audioSource)
49+
);
50+
await audioTrack.removeSilences({
51+
minDuration: 300,
52+
windowSize: 1,
53+
});
54+
4555
image.animate()
4656
.rotation(-16).to(14, 5).to(-7, 10).to(24, 7).to(-3, 9).to(19, 7).to(-14, 12).to(5, 9).to(-30, 13)
4757
.translateX(1700, 0, 'easeOut').to(-1400, 40)
@@ -61,6 +71,7 @@ await composition.add(
6171

6272
(await composition.add(
6373
new core.AudioClip(await core.AudioSource.from('/audio.mp3'), {
74+
muted: true,
6475
transcript: core.Transcript.fromJSON(captions).optimize(),
6576
})
6677
)).addCaptions();

public/harvard.MP3

367 KB
Binary file not shown.

public/silences.mp3

43.1 KB
Binary file not shown.

src/clips/media/media.ts

Lines changed: 17 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,14 @@
11
/**
22
* Copyright (c) 2024 The Diffusion Studio Authors
33
*
4-
* This Source Code Form is subject to the terms of the Mozilla
4+
* This Source Code Form is subject to the terms of the Mozilla
55
* Public License, v. 2.0 that can be found in the LICENSE file.
66
*/
77

88
import { Timestamp, Transcript } from '../../models';
99
import { AudioSource } from '../../sources';
1010
import { RangeDeserializer } from './media.deserializer';
11-
import { serializable, } from '../../services';
11+
import { serializable } from '../../services';
1212
import { replaceKeyframes } from '../clip/clip.utils';
1313
import { ReferenceError, ValidationError } from '../../errors';
1414
import { Clip } from '../clip';
@@ -17,7 +17,6 @@ import type { CaptionPresetStrategy, CaptionTrack } from '../../tracks';
1717
import type { float, frame } from '../../types';
1818
import type { MediaClipProps } from './media.interfaces';
1919

20-
2120
export class MediaClip<Props extends MediaClipProps = MediaClipProps> extends Clip<MediaClipProps> {
2221
public source = new AudioSource();
2322
public declare element?: HTMLAudioElement | HTMLVideoElement;
@@ -54,7 +53,7 @@ export class MediaClip<Props extends MediaClipProps = MediaClipProps> extends Cl
5453
@serializable(Transcript)
5554
public get transcript(): Transcript | undefined {
5655
return this.source.transcript;
57-
};
56+
}
5857

5958
public set transcript(transcript: Transcript | undefined) {
6059
this.source.transcript = transcript;
@@ -170,10 +169,12 @@ export class MediaClip<Props extends MediaClipProps = MediaClipProps> extends Cl
170169
public seek(time: Timestamp): Promise<void> {
171170
return new Promise((resolve, reject) => {
172171
if (!this.element) {
173-
return reject(new ReferenceError({
174-
code: 'elementNotDefined',
175-
message: 'Cannot seek on undefined element',
176-
}));
172+
return reject(
173+
new ReferenceError({
174+
code: 'elementNotDefined',
175+
message: 'Cannot seek on undefined element',
176+
}),
177+
);
177178
}
178179
if (time.millis < this.start.millis || time.millis > this.stop.millis) {
179180
time = this.start;
@@ -205,7 +206,7 @@ export class MediaClip<Props extends MediaClipProps = MediaClipProps> extends Cl
205206
if (start.millis >= stop.millis) {
206207
throw new ValidationError({
207208
code: 'invalidKeyframe',
208-
message: "Start can't lower than or equal the stop"
209+
message: "Start can't lower than or equal the stop",
209210
});
210211
}
211212
// start and/or stop are out of bounds
@@ -285,18 +286,17 @@ export class MediaClip<Props extends MediaClipProps = MediaClipProps> extends Cl
285286
* Generates a new caption track for the current clip using the specified captioning strategy.
286287
* @param strategy An optional CaptionPresetStrategy to define how captions should be generated.
287288
*/
288-
public async addCaptions(strategy?: CaptionPresetStrategy | (new () => CaptionPresetStrategy)): Promise<CaptionTrack> {
289+
public async addCaptions(
290+
strategy?: CaptionPresetStrategy | (new () => CaptionPresetStrategy),
291+
): Promise<CaptionTrack> {
289292
if (!this.track?.composition) {
290293
throw new ValidationError({
291294
code: 'compositionNotDefined',
292295
message: 'Captions can only be generated after the clip has been added to the composition',
293296
});
294297
}
295298

296-
const track = await this.track.composition
297-
.createTrack('caption')
298-
.from(this)
299-
.generate(strategy);
299+
const track = await this.track.composition.createTrack('caption').from(this).generate(strategy);
300300

301301
return track;
302302
}
@@ -308,7 +308,9 @@ export class MediaClip<Props extends MediaClipProps = MediaClipProps> extends Cl
308308
/**
309309
* @deprecated use `addCaptions` instead
310310
*/
311-
public async generateCaptions(strategy?: CaptionPresetStrategy | (new () => CaptionPresetStrategy)) {
311+
public async generateCaptions(
312+
strategy?: CaptionPresetStrategy | (new () => CaptionPresetStrategy),
313+
) {
312314
return this.addCaptions(strategy);
313315
}
314316
}

src/clips/video/video.ts

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
/**
22
* Copyright (c) 2024 The Diffusion Studio Authors
33
*
4-
* This Source Code Form is subject to the terms of the Mozilla
4+
* This Source Code Form is subject to the terms of the Mozilla
55
* Public License, v. 2.0 that can be found in the LICENSE file.
66
*/
77

@@ -51,7 +51,7 @@ export class VideoClip extends VisualMixin(MediaClip<VideoClipProps>) {
5151
this.element.controls = false;
5252
this.element.playsInline = true;
5353
this.element.style.display = 'hidden';
54-
this.element.crossOrigin = "anonymous";
54+
this.element.crossOrigin = 'anonymous';
5555

5656
(this.textrues.html5.source as any).autoPlay = false;
5757
(this.textrues.html5.source as any).loop = false;
@@ -87,7 +87,7 @@ export class VideoClip extends VisualMixin(MediaClip<VideoClipProps>) {
8787

8888
this.state = 'READY';
8989
resolve();
90-
}
90+
};
9191

9292
this.element.onerror = () => {
9393
this.state = 'ERROR';
@@ -98,7 +98,7 @@ export class VideoClip extends VisualMixin(MediaClip<VideoClipProps>) {
9898
});
9999

100100
reject(this.element.error ?? error);
101-
}
101+
};
102102
});
103103
}
104104

@@ -132,7 +132,7 @@ export class VideoClip extends VisualMixin(MediaClip<VideoClipProps>) {
132132
public exit(): void {
133133
if (this.playing) {
134134
this.element.pause();
135-
};
135+
}
136136
if (this.filters && this.view.filters) {
137137
this.view.filters = null as any;
138138
}

src/sources/audio.spec.ts

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77

88
import { describe, it, vi, beforeEach, expect } from 'vitest';
99
import { AudioSource } from './audio'; // Import the AudioSource class
10+
import { findSilences } from './audio.utils';
11+
import { Timestamp } from '../models';
1012

1113
// Mocking the OfflineAudioContext class
1214
class MockOfflineAudioContext {
@@ -25,6 +27,30 @@ class MockOfflineAudioContext {
2527

2628
vi.stubGlobal('OfflineAudioContext', MockOfflineAudioContext); // Stub the global OfflineAudioContext
2729

30+
describe('AudioUtils', () => {
31+
it('all silent', () => {
32+
const silences = findSilences(new Float32Array(100).fill(0), -50, 100, 100);
33+
expect(silences).toEqual([{
34+
start: new Timestamp(0),
35+
stop: new Timestamp(100),
36+
}]);
37+
});
38+
39+
it('no silences', () => {
40+
const silences = findSilences(new Float32Array(100).fill(1), -50, 100, 100);
41+
expect(silences).toEqual([]);
42+
});
43+
44+
it('find silences correctly', () => {
45+
const samples = Array.from({ length: 500 }, (_, index) => index > 300 ? (index < 400 ? 0 : 1) : -1);
46+
const silences = findSilences(new Float32Array(samples), -50, 100, 5000);
47+
expect(silences).toEqual([{
48+
start: new Timestamp(3010),
49+
stop: new Timestamp(4000),
50+
}]);
51+
});
52+
});
53+
2854
describe('AudioSource', () => {
2955
let audioSource: AudioSource;
3056

@@ -33,6 +59,42 @@ describe('AudioSource', () => {
3359
audioSource.file = new File([], 'audio.mp3', { type: 'audio/mp3' });
3460
});
3561

62+
it('find silences correctly', async () => {
63+
const audioBuffer = {
64+
duration: 16,
65+
sampleRate: 1000,
66+
length: 16000,
67+
getChannelData: () => new Float32Array(16000).fill(0), // Return a dummy Float32Array
68+
} as any as AudioBuffer;
69+
audioSource.audioBuffer = audioBuffer;
70+
const silences = await audioSource.silences({});
71+
expect(silences).toEqual([{
72+
start: new Timestamp(0),
73+
stop: new Timestamp(16000),
74+
}]);
75+
});
76+
77+
it('find silences correctly with too high minDuration', async () => {
78+
const audioBuffer = {
79+
duration: 16,
80+
sampleRate: 1000,
81+
length: 16000,
82+
getChannelData: () => new Float32Array(16000).fill(0), // Return a dummy Float32Array
83+
} as any as AudioBuffer;
84+
audioSource.audioBuffer = audioBuffer;
85+
const silences = await audioSource.silences({minDuration: 1e10});
86+
expect(silences).toEqual([{
87+
start: new Timestamp(0),
88+
stop: new Timestamp(16000),
89+
}]);
90+
});
91+
92+
it('find silences correctly after caching', async () => {
93+
const silences = await audioSource.silences({});
94+
const cachedSilences = await audioSource.silences({threshold: 0, minDuration: 1e10, windowSize: 1e10});
95+
expect(silences).toEqual(cachedSilences);
96+
});
97+
3698
it('should decode an audio buffer correctly', async () => {
3799
const buffer = await audioSource.decode(2, 44100, true);
38100
expect(buffer.duration).toBe(5); // Mock duration

src/sources/audio.ts

Lines changed: 40 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,24 @@
11
/**
22
* Copyright (c) 2024 The Diffusion Studio Authors
33
*
4-
* This Source Code Form is subject to the terms of the Mozilla
4+
* This Source Code Form is subject to the terms of the Mozilla
55
* Public License, v. 2.0 that can be found in the LICENSE file.
66
*/
77

88
import { Source } from './source';
99

1010
import type { ClipType } from '../clips';
1111
import type { ArgumentTypes } from '../types';
12-
import type { FastSamplerOptions } from './audio.types';
13-
import type { Transcript } from '../models';
12+
import type { FastSamplerOptions, SilenceOptions } from './audio.types';
13+
import type { Timestamp, Transcript } from '../models';
14+
import { findSilences } from './audio.utils';
15+
16+
const DEFAULT_SAMPLE_RATE = 3000;
1417

1518
export class AudioSource<T extends Object = {}> extends Source<T> {
1619
public readonly type: ClipType = 'audio';
1720
private decoding = false;
21+
private _silences?: { start: Timestamp; stop: Timestamp }[];
1822

1923
public transcript?: Transcript;
2024
public audioBuffer?: AudioBuffer;
@@ -76,17 +80,21 @@ export class AudioSource<T extends Object = {}> extends Source<T> {
7680
* @param options - Sampling options.
7781
* @returns An array of the max values of the samples in the window.
7882
*/
79-
public async fastsampler({ length = 60, start = 0, stop, logarithmic = false }: FastSamplerOptions): Promise<Float32Array> {
83+
public async fastsampler({
84+
length = 60,
85+
start = 0,
86+
stop,
87+
logarithmic = false,
88+
}: FastSamplerOptions = {}): Promise<Float32Array> {
8089
if (typeof start === 'object') start = start.millis;
8190
if (typeof stop === 'object') stop = stop.millis;
8291

83-
const sampleRate = 3000;
84-
const audioBuffer = this.audioBuffer ?? (await this.decode(1, sampleRate, true));
92+
const audioBuffer = this.audioBuffer ?? (await this.decode(1, DEFAULT_SAMPLE_RATE, true));
8593
const channelData = audioBuffer.getChannelData(0);
8694

87-
const firstSample = Math.floor(Math.max(start * sampleRate / 1000, 0));
95+
const firstSample = Math.floor(Math.max((start * DEFAULT_SAMPLE_RATE) / 1000, 0));
8896
const lastSample = stop
89-
? Math.floor(Math.min(stop * sampleRate / 1000, audioBuffer.length))
97+
? Math.floor(Math.min((stop * DEFAULT_SAMPLE_RATE) / 1000, audioBuffer.length))
9098
: audioBuffer.length;
9199

92100
const windowSize = Math.floor((lastSample - firstSample) / length);
@@ -121,4 +129,28 @@ export class AudioSource<T extends Object = {}> extends Source<T> {
121129
}
122130
return div;
123131
}
132+
133+
/**
134+
* Find silences in the audio clip. Results are cached on the source after the first call, so options passed to later calls are ignored.
135+
*
136+
* Uses the default sample rate of 3000 Hz when the audio has not been decoded yet.
137+
* @param options - Silences options.
138+
* @returns An array of the silences (in ms) in the clip.
139+
*/
140+
public async silences({
141+
threshold = -50,
142+
minDuration = 100,
143+
windowSize = 50,
144+
}: SilenceOptions = {}): Promise<{ start: Timestamp; stop: Timestamp }[]> {
145+
if (this._silences) return this._silences;
146+
147+
const audioBuffer = this.audioBuffer ?? (await this.decode(1, DEFAULT_SAMPLE_RATE, true));
148+
const length = Math.floor(audioBuffer.length / windowSize);
149+
const samples = await this.fastsampler({ length, logarithmic: false });
150+
151+
const silences = findSilences(samples, threshold, minDuration, this.duration.millis);
152+
this._silences = silences;
153+
154+
return silences;
155+
}
124156
}

src/sources/audio.types.ts

Lines changed: 31 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -4,20 +4,35 @@ import type { Timestamp } from '../models';
44
* Fast sampler options.
55
*/
66
export type FastSamplerOptions = {
7-
/**
8-
* The number of samples to return.
9-
*/
10-
length?: number;
11-
/**
12-
* The start time in **milliseconds** relative to the beginning of the clip.
13-
*/
14-
start?: Timestamp | number;
15-
/**
16-
* The stop time in **milliseconds** relative to the beginning of the clip.
17-
*/
18-
stop?: Timestamp | number;
19-
/**
20-
* Whether to use a logarithmic scale.
21-
*/
22-
logarithmic?: boolean;
7+
/**
8+
* The number of samples to return.
9+
*/
10+
length?: number;
11+
/**
12+
* The start time in **milliseconds** relative to the beginning of the clip.
13+
*/
14+
start?: Timestamp | number;
15+
/**
16+
* The stop time in **milliseconds** relative to the beginning of the clip.
17+
*/
18+
stop?: Timestamp | number;
19+
/**
20+
* Whether to use a logarithmic scale.
21+
*/
22+
logarithmic?: boolean;
23+
};
24+
25+
export type SilenceOptions = {
26+
/**
27+
* The threshold to use for the silence detection, in dB.
28+
*/
29+
threshold?: number;
30+
/**
31+
* The minimum duration, in milliseconds, that a quiet span must last to be considered a silence.
32+
*/
33+
minDuration?: number;
34+
/**
35+
* The window size to use for the silence detection, in samples of the decoded audio buffer.
36+
*/
37+
windowSize?: number;
2338
};

0 commit comments

Comments
 (0)